Log number of KVCacheManager blocks at init (#87)

tdoublep · web-flow · commit f091ad5f75dd · 2024-05-07T21:27:17.000+02:00
#### Motivation

Users are running out of KV cache blocks on GPUs with less than 80 GB of memory.

#### Modifications

This PR adds a printout of the number of free blocks at start-up time.

#### Result

This will help us debug the issue with users. For example, we could suggest that they set the environment variable `KV_CACHE_MANAGER_NUM_GPU_BLOCKS` to manually increase the number of blocks, but we first need to know what number they are starting from.

#### Related Issues

https://huggingface.co/ibm-fms/granite-7b-lab-accelerator/discussions/1

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
diff --git a/server/text_generation_server/models/paged_causal_lm.py b/server/text_generation_server/models/paged_causal_lm.py
@@ -333,6 +333,9 @@ def __init__(
             total_num_gpu_blocks=total_num_gpu_blocks,
         )
 
+        # log number of free blocks at init
+        print("[PagedKVCacheManager] number of free blocks: %d" % (len(self.kv_cache_manager.free_blocks)))
+
     @property
     def batch_type(self) -> Type[PagedCausalLMBatch]:
         return self._batch_type

Original file line number	Diff line number	Diff line change
`@@ -333,6 +333,9 @@ def __init__(`
`333`	`333`	`total_num_gpu_blocks=total_num_gpu_blocks,`
`334`	`334`	`)`
`335`	`335`
	`336`	`+ # log number of free blocks at init`
	`337`	`+ print("[PagedKVCacheManager] number of free blocks: %d" % (len(self.kv_cache_manager.free_blocks)))`
	`338`	`+`
`336`	`339`	`@property`
`337`	`340`	`def batch_type(self) -> Type[PagedCausalLMBatch]:`
`338`	`341`	`return self._batch_type`