@@ -1165,3 +1165,80 @@ def test_kv_connector_handles_preemption():
11651165 # All memory should be freed since nothing is running.
11661166 assert scheduler .kv_cache_manager .block_pool .get_num_free_blocks () \
11671167 == NUM_BLOCKS - 1
1168+
1169+
def make_output(scheduler: Scheduler):
    """Build a minimal ModelRunnerOutput for the scheduler's running requests.

    Each running request gets exactly one sampled token (id 1000) and no
    spec tokens / logprobs, which is enough to drive update_from_output()
    one step forward in these tests.
    """
    return ModelRunnerOutput(
        req_ids=[req.request_id for req in scheduler.running],
        req_id_to_index={
            req.request_id: i
            for i, req in enumerate(scheduler.running)
        },
        # Use a comprehension rather than [[1000]] * n: sequence repetition
        # would alias a single inner list across all requests, so a mutation
        # of one request's token list would silently affect every other.
        sampled_token_ids=[[1000] for _ in scheduler.running],
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
    )
1182+
1183+
def assert_scheduler_empty(scheduler: Scheduler):
    """Confirm the scheduler is "empty" - i.e. no per-request state leaked.

    Checks every per-request tracking structure in the scheduler, the
    encoder cache manager, and the KV cache manager, and verifies that all
    GPU blocks (minus the one permanently-reserved null block) are back in
    the free queue with a zero ref count.
    """
    # Scheduler metadata.
    assert len(scheduler.requests) == 0
    assert len(scheduler.waiting) == 0
    assert len(scheduler.running) == 0
    assert len(scheduler.finished_req_ids) == 0
    assert len(scheduler._cached_reqs_data) == 0

    # EncoderCacheManager.
    assert len(scheduler.encoder_cache_manager.freed) == 0
    assert len(scheduler.encoder_cache_manager.cached) == 0

    # KVCache Manager: all per-request block tracking must be gone.
    assert len(scheduler.kv_cache_manager.req_to_blocks) == 0
    assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
    assert len(scheduler.kv_cache_manager.num_cached_block) == 0
    num_free_blocks = (
        scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
    assert num_free_blocks == (
        scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1)

    # NOTE(rob): only the ref count on blocks is expected to be 0. The
    # hash value etc. will remain, since we lazily evict for the prefix
    # cache - so cached_block_hash_to_block is deliberately NOT checked.
    for block in scheduler.kv_cache_manager.block_pool.blocks:
        assert block.ref_cnt == 0
1214+
1215+
def test_memory_leak():
    """Run a batch of requests to completion and verify no state leaks.

    After every request finishes, the scheduler should hold no per-request
    bookkeeping and all KV cache blocks should be free again.
    """
    scheduler = create_scheduler(enable_prefix_caching=True)

    NUM_REQUESTS = 5
    NUM_TOKENS = 10
    MAX_TOKENS = 10
    requests = create_requests(num_requests=NUM_REQUESTS,
                               num_tokens=NUM_TOKENS,
                               max_tokens=MAX_TOKENS)

    # Enqueue the requests one at a time, stepping the scheduler after
    # each add so requests start at staggered points.
    for request in requests:
        scheduler.add_request(request)
        step = scheduler.schedule()
        scheduler.update_from_output(step, make_output(scheduler))

    # Drain: keep stepping until no request remains running. Note the
    # final schedule() call still happens before we notice the batch is
    # empty, matching the engine's schedule-then-check loop shape.
    while True:
        step = scheduler.schedule()
        if not scheduler.running:
            break
        scheduler.update_from_output(step, make_output(scheduler))

    # Confirm no memory leak.
    assert_scheduler_empty(scheduler)
0 commit comments