Skip to content

Commit 10916b2

Browse files
authored
[PD] Fix for issue where requests occasionally go missing in async transfer (SW-234952) (#1978)
1 parent 715e3c1 commit 10916b2

File tree

3 files changed

+21
-9
lines changed

3 files changed

+21
-9
lines changed

benchmarks/benchmark_serving.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -374,12 +374,12 @@ def sample_random_requests(
374374
size=prefix_len).tolist()
375375

376376
input_lens = np.random.randint(
377-
int(input_len * range_ratio),
377+
max(1, int(input_len * range_ratio)), # At least 1 input token
378378
input_len + 1,
379379
size=num_prompts,
380380
)
381381
output_lens = np.random.randint(
382-
int(output_len * range_ratio),
382+
max(1, int(output_len * range_ratio)), # At least 1 output token
383383
output_len + 1,
384384
size=num_prompts,
385385
)

vllm/core/scheduler.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -581,13 +581,18 @@ def put_to_shared_dict(prefix, kv_cache, hidden_states):
581581

582582
self.scheduler_profiler.start('internal', 'fetching_kv')
583583
hash_prefix = hash_list(seq_group.prompt_token_ids)
584-
prefix, kv_cache, hidden_states = get_kv_and_hidden_states(
585-
hash_prefix)
586-
if kv_cache is not None:
584+
if len(seq_group.prompt_token_ids) == 1:
585+
# This is a padding seq. Won't be able to fetch KV. skip it.
586+
logger.info("seq len is 1, skip fetching kv...")
587587
fetching_success = True
588-
put_to_shared_dict(prefix, kv_cache, hidden_states)
589588
else:
590-
fetching_success = False
589+
prefix, kv_cache, hidden_states = get_kv_and_hidden_states(
590+
hash_prefix)
591+
if kv_cache is not None:
592+
fetching_success = True
593+
put_to_shared_dict(prefix, kv_cache, hidden_states)
594+
else:
595+
fetching_success = False
591596
self.fetching_done.put((seq_group, fetching_success))
592597
self.fetching_queue.task_done()
593598
self.scheduler_profiler.end()

vllm/worker/hpu_model_runner.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2946,14 +2946,21 @@ def async_recv_kv_caches(model, model_input, attn_metadata,
29462946
model_input.attn_metadata.seq_lens_tensor
29472947
seq_lens = seq_lens_tensor.tolist() #2D list
29482948
hidden_states_list = []
2949+
HIDDEN_SHAPE = (1, 7168)
29492950
start_block_idx = 0
29502951
k_v_head_size = 576
29512952
bypass_model_exec = True
29522953
htorch.core.mark_step()
29532954
for idx, slen in enumerate(seq_lens):
29542955
if slen == 1:
2955-
hidden_states_list.append(
2956-
hidden_states_list[0])
2956+
if hidden_states_list:
2957+
hidden_states_list.append(
2958+
hidden_states_list[0])
2959+
else:
2960+
logger.warning("The first seq len is 1")
2961+
dummy_hidden = torch.zeros(HIDDEN_SHAPE, device="hpu")
2962+
hidden_states_list.append(
2963+
dummy_hidden)
29572964
# skip the seq with only one token
29582965
continue
29592966
num_blocks = (slen + self.block_size -

0 commit comments

Comments (0)