Commit d19245a

Include prompt prefix_id in per-request logs, log metadata when loading
Log the number of virtual tokens and size in memory when loading prompt prefixes.
1 parent 240a2da commit d19245a

2 files changed: +19 -10 lines

router/src/grpc_server.rs

Lines changed: 2 additions & 0 deletions

@@ -90,6 +90,7 @@ impl GenerationService for GenerationServicer {
         skip_all,
         fields(
             input=?request.get_ref().requests.iter().map(|r| truncate(&r.text, 32)).collect::<Vec<Cow<'_,str>>>(),
+            prefix_id=?request.get_ref().prefix_id,
             correlation_id=?request.metadata().get("x-correlation-id").map(|mv| mv.to_str().unwrap_or("<non-ascii>")).unwrap_or("<none>"),
             input_bytes=?request.get_ref().requests.iter().map(|r| r.text.len()).collect::<Vec<usize>>(),
             params=?request.get_ref().params,
@@ -171,6 +172,7 @@ impl GenerationService for GenerationServicer {
         skip_all,
         fields(
             input=?truncate(&request.get_ref().request.as_ref().map(|r| &*r.text).unwrap_or(""), 32),
+            prefix_id=?request.get_ref().prefix_id,
             correlation_id=?request.metadata().get("x-correlation-id").map(|mv| mv.to_str().unwrap_or("<non-ascii>")).unwrap_or("<none>"),
             input_bytes=?request.get_ref().request.as_ref().map(|r| r.text.len()).unwrap_or(0),
             params=?request.get_ref().params,

server/text_generation_server/prompt_cache.py

Lines changed: 17 additions & 10 deletions

@@ -45,34 +45,34 @@ def __init__(self,
     ) -> None:
         self.prefix_id = prefix_id
         self.prompt = prompt
-        self.prompt_size_mb = PromptCacheNode._get_prompt_size_mb(prompt)
+        self.prompt_virtual_tokens, self.prompt_size_mb = PromptCacheNode._get_prompt_stats(prompt)
         self.next = next
         self.prev = prev
 
     @staticmethod
-    def _get_prompt_size_mb(prompt: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]) -> int:
-        """Get the memory size of a prompt. Note that we round up to the nearest
+    def _get_prompt_stats(prompt: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]) -> Tuple[int, int]:
+        """Get the number of virtual tokens and memory size of a prompt. Note that we round up to the nearest
         increment of 512.
 
         Args:
             prompt: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
                 Prompt tuple/tensor we want to take the size of.
 
         Return:
-            Prompt size in Mb.
+            (prompt virtual token count, prompt size in MiB)
         """
         # In some cases, we may have None, e.g., an encoder / decoder
         # where we don't have prompts to inject for both components
         if prompt is None:
-            return 0
+            return 0, 0
         # We either have a Tensor or an iterable of tensors; if it's not
        # a tensor, take the size of all contained tensor objects.
         elif not isinstance(prompt, torch.Tensor):
-            return sum(map(PromptCacheNode._get_prompt_size_mb, prompt))
+            return tuple(sum(x) for x in zip(*map(PromptCacheNode._get_prompt_stats, prompt)))
         # Otherwise it's a tensor; round up to nearest 512 increment & convert to mb.
         # See: https://discuss.pytorch.org/t/how-to-know-the-memory-allocated-for-a-tensor-on-gpu/28537/15
         raw_size = prompt.element_size() * prompt.nelement()
-        return (math.ceil(raw_size / 512) * 512) / (1024 ** 2)
+        return prompt.shape[0], (math.ceil(raw_size / 512) * 512) / (1024 ** 2)
 
 
 class DoublyLinkedList:
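
For illustration, the arithmetic in the new _get_prompt_stats can be checked in isolation. The sketch below is a standalone reimplementation of the same calculation, not the repository code itself; the tensor shape and dtype are hypothetical.

import math
import torch

def prompt_stats(prompt: torch.Tensor) -> tuple[int, float]:
    # Mirrors the diff's logic: virtual tokens = first dimension,
    # memory = raw bytes rounded up to a 512-byte block, reported in MiB.
    raw_size = prompt.element_size() * prompt.nelement()
    size_mb = (math.ceil(raw_size / 512) * 512) / (1024 ** 2)
    return prompt.shape[0], size_mb

# Hypothetical prompt: 20 virtual tokens, hidden size 4096, float16.
prompt = torch.zeros(20, 4096, dtype=torch.float16)
tokens, size_mb = prompt_stats(prompt)
print(tokens, size_mb)  # 20 0.15625  (20 * 4096 * 2 bytes = 163840 B, already a multiple of 512)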
@@ -299,6 +299,8 @@ def _add_prefix_id_to_cache(
         new_cache_node = PromptCacheNode(prompt=prefix, prefix_id=prefix_id)
         del_tensors = {}
 
+        new_prompt_virtual_tokens = new_cache_node.prompt_virtual_tokens
+        new_prompt_size_mb = new_cache_node.prompt_size_mb
         with self.requires_lock:
             # If we already have it, return the node in the cache.
             # This will release the tensor we just loaded.
@@ -309,7 +311,7 @@ def _add_prefix_id_to_cache(
             if new_cache_node.prompt_size_mb > PROMPT_CACHE_SIZE_MB:
                 raise ValueError(f"Prefix ID object {prefix_id} exceeds the allowed cache size")
 
-            while self.cache_size_mb + new_cache_node.prompt_size_mb > PROMPT_CACHE_SIZE_MB:
+            while self.cache_size_mb + new_prompt_size_mb > PROMPT_CACHE_SIZE_MB:
                 # Hold a reference to the set of things to be deallocated until we're out of the
                 # critical section; then, we can handle the cache keys in a thread safe way
                 # without deallocating our tensors in it.
@@ -322,10 +324,15 @@ def _add_prefix_id_to_cache(
                 self.cache_size_mb -= del_node.prompt_size_mb
             self.cache_dll.add_node_as_head(new_cache_node)
             self.cache_map[prefix_id] = new_cache_node
-            self.cache_size_mb += new_cache_node.prompt_size_mb
-            logger.info(f"Added prefix {prefix_id} to the prompt cache")
+            self.cache_size_mb += new_prompt_size_mb
+            cache_size_mb = self.cache_size_mb
         if del_tensors:
             logger.info(f"Deleted prefixes {list(del_tensors.keys())} from the prompt cache")
+
+        logger.info(
+            f"Added prefix {prefix_id} to the prompt cache, has {new_prompt_virtual_tokens} virtual tokens"
+            f", size {new_prompt_size_mb:.3f}MiB, total cache size is now {cache_size_mb:.2f}MiB"
+        )
         return new_cache_node
 
     def _get_from_cache(self, prefix_id: str) -> PromptCacheNode:
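
The later hunks snapshot the new node's statistics before the critical section and move the "Added prefix ..." log after it, so the summary line can report the per-prefix stats and the updated total without holding the lock. A minimal sketch of that evict-then-log pattern, using a hypothetical OrderedDict-based cache and print statements in place of the repository's DoublyLinkedList, cache_map, requires_lock, and logger:

import threading
from collections import OrderedDict

PROMPT_CACHE_SIZE_MB = 512  # hypothetical limit standing in for PROMPT_CACHE_SIZE_MB

class TinyPromptCache:
    """Illustrative LRU cache keyed by prefix_id; values are (virtual_tokens, size_mb)."""

    def __init__(self) -> None:
        self.lock = threading.Lock()
        self.entries: OrderedDict[str, tuple[int, float]] = OrderedDict()
        self.cache_size_mb = 0.0

    def add(self, prefix_id: str, virtual_tokens: int, size_mb: float) -> None:
        if size_mb > PROMPT_CACHE_SIZE_MB:
            raise ValueError(f"Prefix ID object {prefix_id} exceeds the allowed cache size")
        deleted = []
        with self.lock:
            # Evict least-recently-added entries until the new prompt fits.
            while self.cache_size_mb + size_mb > PROMPT_CACHE_SIZE_MB:
                old_id, (_, old_size) = self.entries.popitem(last=False)
                self.cache_size_mb -= old_size
                deleted.append(old_id)
            self.entries[prefix_id] = (virtual_tokens, size_mb)
            self.cache_size_mb += size_mb
            cache_size_mb = self.cache_size_mb
        # Log outside the critical section, as the commit does.
        if deleted:
            print(f"Deleted prefixes {deleted} from the prompt cache")
        print(
            f"Added prefix {prefix_id} to the prompt cache, has {virtual_tokens} virtual tokens"
            f", size {size_mb:.3f}MiB, total cache size is now {cache_size_mb:.2f}MiB"
        )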
