
Commit 2342f16

tjohnson31415 and njhill committed
feat: validate that prompts do not have nonfinite values
Signed-off-by: Travis Johnson <[email protected]>
Co-authored-by: Nick Hill <[email protected]>
1 parent 36052c1 commit 2342f16
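Note on the motivation (informational, not part of the commit): the failure mode being guarded against is overflow during dtype narrowing, where a value that is finite in a wider dtype becomes inf once cast to the dtype used for inference. A minimal standalone sketch of the effect:

import torch

# Finite in float64...
wide = torch.tensor([torch.finfo(torch.float64).max], dtype=torch.float64)
print(wide.isfinite().all())    # tensor(True)

# ...but it overflows to inf when narrowed to float32.
narrow = wide.to(torch.float32)
print(narrow)                   # tensor([inf])
print(narrow.isfinite().all())  # tensor(False)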

File tree

server/tests/test_prompt_cache.py
server/text_generation_server/prompt_cache.py

2 files changed: +87 -9 lines changed

server/tests/test_prompt_cache.py

Lines changed: 70 additions & 1 deletion
@@ -20,7 +20,7 @@ def temp_prompt_cache():
     return prompt_cache.PrefixCache(
         device=DEVICE,
         dtype=torch.float32,
-        max_length=256,
+        max_length=8,
         encoder_decoder=False,
         decoder_start_tok_embedding=None
     )
@@ -264,3 +264,72 @@ def test_get_cache_len(mock_load_tensors, temp_prompt_cache):
     temp_prompt_cache.get("prompt1")
     temp_prompt_cache.get("prompt2")
     assert len(temp_prompt_cache) == 2
+
+### Test cases for invalid prompts
+@patch("pathlib.Path.is_file")
+def test_prompt_not_exist(mock_is_file, temp_prompt_cache):
+    mock_is_file.return_value = False
+    with pytest.raises(Exception):
+        temp_prompt_cache.get("bad_prompt")
+    assert len(temp_prompt_cache) == 0
+
+@patch("torch.load")
+@patch("pathlib.Path.is_file")
+def test_prompt_with_wrong_dims(mock_is_file, mock_torch_load, temp_prompt_cache):
+    mock_is_file.return_value = True
+
+    # one dimension is not enough
+    mock_torch_load.return_value = torch.ones((3))
+    with pytest.raises(Exception):
+        temp_prompt_cache.get("bad_prompt")
+    assert len(temp_prompt_cache) == 0
+
+    # three dimensions is too many
+    mock_torch_load.return_value = torch.ones((3, 3, 3))
+    with pytest.raises(Exception):
+        temp_prompt_cache.get("bad_prompt")
+    assert len(temp_prompt_cache) == 0
+
+@patch("torch.load")
+@patch("pathlib.Path.is_file")
+def test_prompt_too_many_virtual_tokens(mock_is_file, mock_torch_load, temp_prompt_cache):
+    mock_is_file.return_value = True
+
+    mock_torch_load.return_value = torch.ones((9,16))
+    with pytest.raises(Exception):
+        temp_prompt_cache.get("bad_prompt")
+    assert len(temp_prompt_cache) == 0
+
+@patch("torch.load")
+@patch("pathlib.Path.is_file")
+def test_prompt_wrong_embed_size(mock_is_file, mock_torch_load, temp_prompt_cache):
+    mock_is_file.return_value = True
+    # set embed_size to 16
+    temp_prompt_cache.embed_size = 16
+    mock_torch_load.return_value = torch.ones((1,15))
+    with pytest.raises(Exception):
+        temp_prompt_cache.get("bad_prompt")
+    assert len(temp_prompt_cache) == 0
+
+@patch("torch.load")
+@patch("pathlib.Path.is_file")
+def test_prompt_with_infinite_after_conversion(mock_is_file, mock_torch_load, temp_prompt_cache):
+    mock_is_file.return_value = True
+    bad_tensor = torch.ones((3,3), dtype=torch.float64)
+    bad_tensor[1, 1] = torch.finfo(torch.float64).max
+    mock_torch_load.return_value = bad_tensor
+    with pytest.raises(Exception) as e:
+        temp_prompt_cache.get("bad_prompt")
+    assert e.match("torch.float64 to torch.float32")
+    assert len(temp_prompt_cache) == 0
+
+@patch("torch.load")
+@patch("pathlib.Path.is_file")
+def test_prompt_with_nan(mock_is_file, mock_torch_load, temp_prompt_cache):
+    mock_is_file.return_value = True
+    bad_tensor = torch.ones((3,3), dtype=torch.float16)
+    bad_tensor[1, 1] = torch.nan
+    mock_torch_load.return_value = bad_tensor
+    with pytest.raises(Exception):
+        temp_prompt_cache.get("bad_prompt")
+    assert len(temp_prompt_cache) == 0
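A note on the tests above (informational, not part of the diff): stacked @patch decorators inject their mocks bottom-up, so the decorator nearest the function supplies the first mock argument; that is why mock_is_file precedes mock_torch_load in the signatures even though @patch("torch.load") appears first. A small standalone sketch of the ordering, assuming torch is installed; the helper name check_patch_order is ours:

import pathlib
import torch
from unittest.mock import patch

@patch("torch.load")              # outermost decorator -> second mock argument
@patch("pathlib.Path.is_file")    # innermost decorator -> first mock argument
def check_patch_order(mock_is_file, mock_torch_load):
    mock_is_file.return_value = False
    mock_torch_load.return_value = torch.ones((1, 1))
    # Each call is routed to the mock configured for it above.
    assert pathlib.Path("some/prompt").is_file() is False
    assert torch.load("some/prompt").shape == (1, 1)

check_patch_order()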

server/text_generation_server/prompt_cache.py

Lines changed: 17 additions & 8 deletions
@@ -207,20 +207,20 @@ def _load_embedding_tensors(self, prefix_id: str) -> Union[torch.Tensor, Tuple[t
             Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
                 Loaded encoder / decoder prompt tensor for the model under consideration.
         """
-        decoder_prefix = self._load_embedding_tensor(prefix_id, "decoder.pt")
+        decoder_prefix = self._load_embedding_tensor(prefix_id, "decoder.pt", dtype=self.dtype)
         # For encoder-decoder we store a tuple of (encoder_prefix, decoder_prefix),
         # at least one must be non-None
         if self.is_encoder_decoder:
-            encoder_prefix = self._load_embedding_tensor(prefix_id, "encoder.pt")
+            encoder_prefix = self._load_embedding_tensor(prefix_id, "encoder.pt", dtype=self.dtype)
             if decoder_prefix is None:
                 if encoder_prefix is None:
                     raise PrefixNotFound(f"Prefix id {prefix_id} not found")
             else:
                 # TODO confirm this cat is correct
                 decoder_prefix = torch.cat((decoder_prefix, self.decoder_start_tok_embedding))
-                decoder_prefix = decoder_prefix.to(self.dtype).to(self.device, non_blocking=True)
+                decoder_prefix = decoder_prefix.to(self.device, non_blocking=True)
             if encoder_prefix is not None:
-                encoder_prefix = encoder_prefix.to(self.dtype).to(self.device, non_blocking=True)
+                encoder_prefix = encoder_prefix.to(self.device, non_blocking=True)
             prefix = encoder_prefix, decoder_prefix
         # For decoder-only we store just the decoder prefix
         elif decoder_prefix is None:
@@ -229,7 +229,7 @@ def _load_embedding_tensors(self, prefix_id: str) -> Union[torch.Tensor, Tuple[t
             prefix = decoder_prefix.to(self.dtype).to(self.device, non_blocking=True)
         return prefix

-    def _load_embedding_tensor(self, prefix_id: str, filename: str) -> torch.Tensor:
+    def _load_embedding_tensor(self, prefix_id: str, filename: str, dtype: torch.dtype) -> torch.Tensor:
         """Load an embedding tensor from a single file.

         Args:
@@ -264,9 +264,18 @@ def _load_embedding_tensor(self, prefix_id: str, filename: str) -> torch.Tensor:
             raise Exception(
                 f"Prefix embedding tensor dim {prefix.shape[1]} does not match model ({self.embed_size})"
             )
-
-        prefix.requires_grad = False
-        return prefix
+        # convert to the desired dtype
+        converted_prefix = prefix.to(dtype)
+        # detect if we have non-finite elements after the conversion that will
+        # cause problems for inference
+        if not converted_prefix.isfinite().all():
+            # check if the problem was in the pre-converted tensor
+            if not prefix.isfinite().all():
+                raise Exception(f"Prefix contains non-finite elements")
+            raise Exception(f"Prefix contains non-finite elements after conversion from {prefix.dtype} to {dtype}")
+
+        converted_prefix.requires_grad = False
+        return converted_prefix

     def _add_prefix_id_to_cache(
         self,
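For reference (not part of the commit): the validation added above converts the loaded prefix to the target dtype, then rejects it if any element is non-finite, distinguishing tensors that were already bad from tensors that only became bad during conversion. A standalone sketch of that pattern, using a hypothetical helper name _check_prefix rather than the actual method:

import torch

def _check_prefix(prefix: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # Mirrors the logic added in _load_embedding_tensor, for illustration only.
    converted = prefix.to(dtype)
    if not converted.isfinite().all():
        if not prefix.isfinite().all():
            raise ValueError("Prefix contains non-finite elements")
        raise ValueError(
            f"Prefix contains non-finite elements after conversion from {prefix.dtype} to {dtype}"
        )
    converted.requires_grad = False
    return converted

# A NaN is already non-finite before conversion, so the first error path fires.
bad = torch.ones((3, 3), dtype=torch.float16)
bad[1, 1] = torch.nan
try:
    _check_prefix(bad, torch.float32)
except ValueError as err:
    print(err)   # Prefix contains non-finite elements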
