
Commit 8e4f35c

joerunde authored and njhill committed
Support tuned prompts in peft adapter format
This PR adds support for loading prefixes from saved peft adapters. When a model with a peft adapter is saved with model.save_pretrained(output_path), the resulting adapter_model.safetensors (or adapter_model.bin) file can now be loaded as a prefix. Only peft adapters created with prompt tuning are supported, since we load the prompt_embeddings tensor out of the saved adapter.
1 parent 28c5f5f commit 8e4f35c
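
For reference, a minimal sketch of how such an adapter might be produced with the Hugging Face peft library (the base model name, virtual-token count, and output paths below are illustrative, not part of this PR):

    from transformers import AutoModelForCausalLM
    from peft import PromptTuningConfig, TaskType, get_peft_model

    # Illustrative base model; any causal LM supported by peft prompt tuning works here
    base = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    config = PromptTuningConfig(task_type=TaskType.CAUSAL_LM, num_virtual_tokens=8)
    model = get_peft_model(base, config)

    # ... run prompt tuning ...

    # Default save format writes adapter_model.safetensors
    model.save_pretrained("output_path")
    # safe_serialization=False writes the raw adapter_model.bin instead
    model.save_pretrained("output_path_bin", safe_serialization=False)

Either output file contains a prompt_embeddings entry, which is the tensor the server loads as the prefix.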

File tree

5 files changed: +220 -46 lines changed

Binary file not shown.

integration_tests/test_cases_tinyllama.yaml

Lines changed: 34 additions & 0 deletions
@@ -175,6 +175,40 @@
         stopReason: MAX_TOKENS
         text: ' Once he can go to the park and play with his friends.'

+# Prompt loaded from peft adapter
+- name: Greedy with tuned peft adapter prefix
+  request:
+    prefixId: tinyllama_peft_adapter
+    params:
+      method: GREEDY
+      stopping:
+        maxNewTokens: 13
+    requests:
+      - {"text": ""}
+  response:
+    responses:
+      - generatedTokenCount: 13
+        inputTokenCount: 1
+        stopReason: MAX_TOKENS
+        text: ' Once upon a time, there was a little boy named Tim.'
+
+# Prompt loaded from peft adapter saved in a raw .bin file
+- name: Greedy with tuned peft adapter prefix in raw .bin format
+  request:
+    prefixId: tinyllama_peft_adapter_raw
+    params:
+      method: GREEDY
+      stopping:
+        maxNewTokens: 13
+    requests:
+      - {"text": ""}
+  response:
+    responses:
+      - generatedTokenCount: 13
+        inputTokenCount: 1
+        stopReason: MAX_TOKENS
+        text: ' Once upon a time, there was a little boy named Tim.'
+
 # Prompt prefix with truncation
 - name: Greedy with tuned prompt prefix with truncation
   request:

server/tests/test_prompt_cache.py

Lines changed: 65 additions & 2 deletions
@@ -2,9 +2,13 @@
 it does LRU eviction in a thread safe way correctly.
 """
 import gc
+import os
+from pathlib import Path
+
 import pytest
 from unittest.mock import patch
 import torch
+import safetensors.torch
 from threading import Lock
 from text_generation_server import prompt_cache

@@ -14,6 +18,32 @@
 else:
     DEVICE = None

+TESTS_DIR = os.path.dirname(__file__)
+REPO_ROOT = os.path.dirname(os.path.dirname(TESTS_DIR))
+INTEGRATION_TESTS_DIR = os.path.join(REPO_ROOT, "integration_tests")
+
+
+@pytest.fixture()
+def temp_prompt_store():
+    with patch("text_generation_server.prompt_cache.PREFIX_STORE_PATH", Path(os.path.join(INTEGRATION_TESTS_DIR, "prompt_prefixes"))):
+        yield
+
+
+@pytest.fixture()
+def tiny_starcoder_decoder_prompt(temp_prompt_store):
+    return "tiny_starcoder"
+
+
+@pytest.fixture()
+def tiny_raw_llama_peft_adapter_prompt(temp_prompt_store):
+    return "tinyllama_peft_adapter_raw"
+
+
+@pytest.fixture()
+def tiny_llama_peft_adapter_prompt(temp_prompt_store):
+    return "tinyllama_peft_adapter"
+
+
 @pytest.fixture()
 def temp_prompt_cache_enc_dec_meta():
     """Build an empty prompt cache for an encoder-decoder model with the 'meta'
@@ -285,11 +315,11 @@ def test_get_cache_len(mock_load_tensors, temp_prompt_cache):

 ### Test code paths for encoder decoder model
 # TODO: add more tests here!
-@patch("text_generation_server.prompt_cache.PrefixCache._load_embedding_tensor")
+@patch("text_generation_server.prompt_cache.PrefixCache._load_torch_file")
 def test_prompt_model_device_diff(mock_load, temp_prompt_cache_enc_dec_meta):
     # create prefix tensor on CPU which should be converted to the 'meta' device
     # before the decoder_start_tok_embedding is added to it
-    mock_load.return_value = torch.ones((3,8), device='cpu')
+    mock_load.return_value = torch.ones((4,8), device='cpu')
     temp_prompt_cache_enc_dec_meta.get("bad_prompt")

 ### Test cases for invalid prompts
@@ -360,3 +390,36 @@ def test_prompt_with_nan(mock_is_file, mock_torch_load, temp_prompt_cache):
     with pytest.raises(Exception):
         temp_prompt_cache.get("bad_prompt")
     assert len(temp_prompt_cache) == 0
+
+
+def test_prompt_cache_decoder_only_load(temp_prompt_cache, tiny_starcoder_decoder_prompt):
+    """Simple test that we can load a prompt with a decoder.pt file"""
+    # The cache should load this without raising
+    prompt = temp_prompt_cache.get(tiny_starcoder_decoder_prompt)
+
+    # Assert this is the same tensor that's in decoder.pt
+    decoder_pt_path = os.path.join(prompt_cache.PREFIX_STORE_PATH, tiny_starcoder_decoder_prompt, "decoder.pt")
+    decoder = torch.load(decoder_pt_path)
+    assert decoder.equal(prompt)
+
+
+def test_prompt_cache_peft_decoder_load(temp_prompt_cache, tiny_raw_llama_peft_adapter_prompt):
+    """Simple test that we can load a prompt for a decoder-only model saved with PEFT directly in adapter_model.bin format"""
+    # The cache should load this without raising
+    prompt = temp_prompt_cache.get(tiny_raw_llama_peft_adapter_prompt)
+
+    # Assert this is the same tensor that's in adapter_model.bin
+    adapter_model_path = os.path.join(prompt_cache.PREFIX_STORE_PATH, tiny_raw_llama_peft_adapter_prompt, "adapter_model.bin")
+    adapter_model = torch.load(adapter_model_path, map_location=torch.device('cpu'))
+    assert adapter_model["prompt_embeddings"].equal(prompt)
+
+
+def test_prompt_cache_safetensors_load(temp_prompt_cache, tiny_llama_peft_adapter_prompt):
+    """Simple test that we can load a prompt for a decoder-only model saved with PEFT directly in adapter_model.safetensors format"""
+    # The cache should load this without raising
+    prompt = temp_prompt_cache.get(tiny_llama_peft_adapter_prompt)
+
+    # Assert this is the same tensor that's in adapter_model.safetensors
+    adapter_model_path = os.path.join(prompt_cache.PREFIX_STORE_PATH, tiny_llama_peft_adapter_prompt, "adapter_model.safetensors")
+    adapter_model = safetensors.torch.load_file(adapter_model_path)
+    assert adapter_model["prompt_embeddings"].equal(prompt)
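
These tests point PREFIX_STORE_PATH at integration_tests/prompt_prefixes and compare the cached tensor against the fixture files directly. A rough sketch of how equivalent fixture files could be generated (the shape below is illustrative; the committed fixtures come from an actual prompt-tuned TinyLlama adapter and are the binary files not shown above):

    import torch
    import safetensors.torch

    # Illustrative shape only; any 2-D float tensor keyed by "prompt_embeddings" works
    prompt_embeddings = torch.randn(8, 2048)

    # safetensors variant, read by test_prompt_cache_safetensors_load
    safetensors.torch.save_file(
        {"prompt_embeddings": prompt_embeddings},
        "integration_tests/prompt_prefixes/tinyllama_peft_adapter/adapter_model.safetensors",
    )

    # raw .bin variant (safe_serialization=False), read by test_prompt_cache_peft_decoder_load
    torch.save(
        {"prompt_embeddings": prompt_embeddings},
        "integration_tests/prompt_prefixes/tinyllama_peft_adapter_raw/adapter_model.bin",
    )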

server/text_generation_server/prompt_cache.py

Lines changed: 121 additions & 44 deletions
@@ -5,6 +5,7 @@
 import re
 import threading
 from typing import Dict, List, Union, Tuple, Optional
+from safetensors.torch import load_file as safe_load_file

 import torch

@@ -191,11 +192,61 @@ def get(self, prefix_id: str) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.T
         cache_node = self._get_from_cache(prefix_id)
         if cache_node is None:
             # Release the lock & load the tensors
-            prefix = self._load_embedding_tensors(prefix_id)
+            self._reject_bad_prefix_ids(prefix_id)
+            if self._is_peft_prefix(prefix_id):
+                prefix = self._load_embedding_tensors_peft(prefix_id)
+            else:
+                prefix = self._load_embedding_tensors(prefix_id)
             # Relock & add the newly loaded tensor to the cache
             cache_node = self._add_prefix_id_to_cache(prefix_id, prefix)
         return cache_node.prompt

+    @staticmethod
+    def _reject_bad_prefix_ids(prefix_id: str) -> None:
+        """Raises if the prefix does not exist, has an invalid name, or attempted to
+        access files outside the prefix cache"""
+        if not VALID_PREFIX_ID_PATTERN.fullmatch(prefix_id):
+            raise Exception(f"Invalid prefix id {prefix_id}, must contain only alphanumeric, _ and - and /")
+        prefix_dir_path = PREFIX_STORE_PATH / prefix_id
+        # Check for path traversal
+        if not os.path.normpath(prefix_dir_path).startswith(str(PREFIX_STORE_PATH) + "/"):
+            raise Exception(f"Invalid prefix id {prefix_id}")
+
+    @staticmethod
+    def _is_peft_prefix(prefix_id):
+        """Returns true if the prefix was saved with peft.save_pretrained()
+        (has an adapter_model.bin file)"""
+        prefix_dir_path = PREFIX_STORE_PATH / prefix_id
+        if not os.path.isdir(prefix_dir_path):
+            return False
+        return "adapter_model" in [os.path.splitext(f)[0] for f in os.listdir(prefix_dir_path)]
+
+    def _load_embedding_tensors_peft(self, prefix_id: str) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """Load prompt tensors for a peft adapter
+        """
+        if self.is_encoder_decoder:
+            raise Exception("encoder-decoder architectures not supported for peft models")
+
+        # safetensors is the default format, but users may have saved their model with
+        # safe_serialization=False to produce the .bin file instead
+        decoder_data_dict = self._load_torch_file(prefix_id, "adapter_model.safetensors")
+        if decoder_data_dict is None:
+            decoder_data_dict = self._load_torch_file(prefix_id, "adapter_model.bin")
+
+        if decoder_data_dict is None:
+            raise PrefixNotFound(f"Prefix id {prefix_id} not found")
+
+        # These files should contain dicts with a `prompt_embeddings` tensor
+        decoder_data = decoder_data_dict["prompt_embeddings"]
+        decoder_prefix = self._process_prefix_tensor(decoder_data, dtype=self.dtype)
+
+        if self.zero:
+            # Return zero prefix early before sending tensor to gpu
+            return self._zero_prefixes(decoder=decoder_prefix, encoder=None)
+
+        decoder_prefix = decoder_prefix.to(self.device, non_blocking=True)
+        return decoder_prefix
+
     def _load_embedding_tensors(self, prefix_id: str) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         """Load prompt tensors corresponding to a single prefix ID to disk. The return
         value of this function should be what is returned when indexing into the cache
@@ -209,63 +260,67 @@ def _load_embedding_tensors(self, prefix_id: str) -> Union[torch.Tensor, Tuple[t
         Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
             Loaded encoder / decoder prompt tensor for the model under consideration.
         """
-        decoder_prefix = self._load_embedding_tensor(prefix_id, "decoder.pt", dtype=self.dtype)
-        # For encoder-decoder we store a tuple of (encoder_prefix, decoder_prefix),
-        # at least one must be non-None
+        decoder_data = self._load_torch_file(prefix_id, "decoder.pt")
+        decoder_prefix = self._process_prefix_tensor(decoder_data, dtype=self.dtype)
+
+        encoder_data = self._load_torch_file(prefix_id, "encoder.pt")
+        encoder_prefix = self._process_prefix_tensor(encoder_data, dtype=self.dtype)
+
+        if decoder_prefix is None and not self.is_encoder_decoder:
+            # Must have a decoder for decoder only model
+            raise PrefixNotFound(f"Prefix id {prefix_id} not found")
+        if decoder_prefix is None and encoder_prefix is None:
+            # And either the decoder or encoder must be provided
+            raise PrefixNotFound(f"Prefix id {prefix_id} not found")
+
+        if self.zero:
+            # Return zero prefixes early before sending tensors to gpu
+            return self._zero_prefixes(encoder=encoder_prefix, decoder=decoder_prefix)
+
         if decoder_prefix is not None:
-            if self.zero is not None:
-                decoder_prefix = self.zero.expand(decoder_prefix.shape)
-            else:
-                decoder_prefix = decoder_prefix.to(self.dtype).to(self.device, non_blocking=True)
+            decoder_prefix = decoder_prefix.to(self.device, non_blocking=True)

+        # For encoder-decoder we store a tuple of (encoder_prefix, decoder_prefix),
         if self.is_encoder_decoder:
-            encoder_prefix = self._load_embedding_tensor(prefix_id, "encoder.pt", dtype=self.dtype)
-            if decoder_prefix is None:
-                if encoder_prefix is None:
-                    raise PrefixNotFound(f"Prefix id {prefix_id} not found")
-            else:
+            if decoder_prefix is not None:
                 # TODO confirm this cat is correct
-                if self.zero is not None:
-                    decoder_prefix = self.zero.expand(decoder_prefix.shape[0] + 1, *decoder_prefix.shape[1:])
-                else:
-                    decoder_prefix = torch.cat((decoder_prefix, self.decoder_start_tok_embedding))
+                decoder_prefix = torch.cat((decoder_prefix, self.decoder_start_tok_embedding))
             if encoder_prefix is not None:
-                if self.zero is not None:
-                    encoder_prefix = self.zero.expand(encoder_prefix.shape)
-                else:
-                    encoder_prefix = encoder_prefix.to(self.device, non_blocking=True)
-            prefix = encoder_prefix, decoder_prefix
-        # For decoder-only we store just the decoder prefix
-        elif decoder_prefix is None:
-            raise PrefixNotFound(f"Prefix id {prefix_id} not found")
+                encoder_prefix = encoder_prefix.to(self.device, non_blocking=True)
+
+            return encoder_prefix, decoder_prefix
+
+        return decoder_prefix
+
+    @staticmethod
+    def _load_torch_file(prefix_id: str, filename: str) -> torch.Tensor | dict:
+        """Loads a file for the given prefix"""
+        prefix_path = PREFIX_STORE_PATH / prefix_id / filename
+        if not prefix_path.is_file():
+            return None
+
+        logger.info(f"Loading new prefix {prefix_id}/{filename}")
+
+        if os.path.splitext(prefix_path)[1] == ".safetensors":
+            return safe_load_file(prefix_path, device='cpu')
         else:
-            prefix = decoder_prefix
-        return prefix
+            return torch.load(prefix_path, weights_only=True, map_location=torch.device('cpu'))

-    def _load_embedding_tensor(self, prefix_id: str, filename: str, dtype: torch.dtype) -> torch.Tensor:
-        """Load an embedding tensor from a single file.
+    def _process_prefix_tensor(self, prefix: Optional[torch.Tensor], dtype: torch.dtype) -> Optional[torch.Tensor]:
+        """Convert a prefix tensor to the correct dtype and run some validation checks.

         Args:
-            prefix_id: str
-                Name of the file that we want to load a torch tensor from.
-            filename: str
-                Name of the file to be loaded.
+            prefix: torch.Tensor
+                A prefix tensor loaded from a file.
+            dtype: torch.dtype
+                The desired dtype of the final prefix tensor.

         Returns:
             torch.Tensor
-                Tensor object corresponding to loaded prompt.
+                A Tensor object corresponding to loaded prompt.
         """
-        if not VALID_PREFIX_ID_PATTERN.fullmatch(prefix_id):
-            raise Exception(f"Invalid prefix id {prefix_id}, must contain only alphanumeric, _ and - and /")
-        prefix_path = PREFIX_STORE_PATH / prefix_id / filename
-        # Check for path traversal
-        if not os.path.normpath(prefix_path).startswith(str(PREFIX_STORE_PATH) + "/"):
-            raise Exception(f"Invalid prefix id {prefix_id}")
-        if not prefix_path.is_file():
+        if prefix is None:
             return None
-
-        logger.info(f"Loading new prefix {prefix_id}/{filename}")
-        prefix = torch.load(prefix_path, weights_only=True, map_location=torch.device('cpu'))
         # Verify that it's a tensor of the correct shape
         if not torch.is_tensor(prefix) or len(prefix.shape) != 2:
             raise Exception(f"Invalid prefix embedding tensor")
@@ -290,6 +345,28 @@ def _load_embedding_tensor(self, prefix_id: str, filename: str, dtype: torch.dty
         converted_prefix.requires_grad = False
         return converted_prefix

+    def _zero_prefixes(
+        self,
+        encoder: Optional[torch.Tensor],
+        decoder: Optional[torch.Tensor]
+    ) -> Optional[torch.Tensor] | Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+        """If the return_zero flag is set, we replace the encoder and decoder prefixes
+        with zero tensors instead"""
+        if encoder is not None:
+            encoder = self.zero.expand(encoder.shape)
+
+        if self.is_encoder_decoder:
+            if decoder is not None:
+                # For encoder-decoder models we need an extra column on the decoder to account for
+                # the decoder_start_tok_embedding
+                decoder = self.zero.expand(decoder.shape[0] + 1, *decoder.shape[1:])
+            return encoder, decoder
+
+        if decoder is not None:
+            decoder = self.zero.expand(decoder.shape)
+
+        return decoder
+
     def _add_prefix_id_to_cache(
         self,
         prefix_id: str,
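
Taken together, the prompt_cache.py changes make prefix loading format-aware: adapter_model.safetensors is tried first, then adapter_model.bin, and otherwise the pre-existing decoder.pt/encoder.pt path is used. A hypothetical standalone helper (not part of this PR) sketching that lookup order for a decoder-only prefix directory:

    from pathlib import Path

    import torch
    from safetensors.torch import load_file as safe_load_file

    def load_prefix_embedding(prefix_dir: Path) -> torch.Tensor:
        """Illustrative helper mirroring the lookup order used by the new cache code."""
        for name in ("adapter_model.safetensors", "adapter_model.bin"):
            path = prefix_dir / name
            if path.is_file():
                if name.endswith(".safetensors"):
                    data = safe_load_file(path, device="cpu")
                else:
                    data = torch.load(path, weights_only=True, map_location="cpu")
                # PEFT prompt-tuning adapters store the virtual-token embeddings under this key
                return data["prompt_embeddings"]
        # Fall back to the pre-existing raw-tensor format
        return torch.load(prefix_dir / "decoder.pt", weights_only=True, map_location="cpu")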

0 commit comments
