Skip to content

Commit 8e0adbe

Browse files
committed
[Fix] fix some errors
Signed-off-by: Csrayz <[email protected]>
1 parent 3662c43 commit 8e0adbe

File tree

3 files changed

+6
-4
lines changed

3 files changed

+6
-4
lines changed

docs/source/user_guide/configuration/additional_config.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ The details of each config option are as follows:
5454
| ---- | ---- | ------- | ----------- |
5555
| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
5656
| `max_long_partial_prefills` | Union[int, float] | `float('inf')` | The maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. |
57-
| `long_prefill_token_threshold` | Union[int, float] | `False` | a request is considered long if the prompt is longer than this number of tokens. |
57+
| `long_prefill_token_threshold` | Union[int, float] | `float('inf')` | A request is considered long if the prompt is longer than this number of tokens. |
5858

5959
ascend_scheduler_config also supports the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
6060

vllm_ascend/core/schedule_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def __post_init__(self) -> None:
6666
else:
6767
if self.long_prefill_token_threshold is None:
6868
self.long_prefill_token_threshold = \
69-
int(self.max_model_len * 0.04)
69+
max(1, int(self.max_model_len * 0.04))
7070

7171
assert (self.max_long_partial_prefills > 0)
7272
assert (self.long_prefill_token_threshold > 0)

vllm_ascend/core/scheduler.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def schedule(self) -> SchedulerOutput:
7878
# Skip long prompt requests in prefill stage.
7979
# long_prefill_budget is float('inf') if not use.
8080
long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills
81+
long_prefill_token_threshold = self.vllm_config.scheduler_config.long_prefill_token_threshold
8182

8283
# Schedule prefill requests first.
8384
while self.waiting and token_budget > 0:
@@ -177,7 +178,7 @@ def skip_cur_request():
177178
skip_cur_request()
178179
continue
179180

180-
if num_new_tokens > self.vllm_config.scheduler_config.long_prefill_token_threshold \
181+
if num_new_tokens > long_prefill_token_threshold \
181182
and long_prefill_budget <= 0:
182183
skip_cur_request()
183184
continue
@@ -231,7 +232,8 @@ def skip_cur_request():
231232
# Update request info.
232233
num_scheduled_tokens[request.request_id] = num_new_tokens
233234
token_budget -= num_new_tokens
234-
long_prefill_budget -= 1
235+
if num_new_tokens > long_prefill_token_threshold:
236+
long_prefill_budget -= 1
235237
request.status = RequestStatus.RUNNING
236238
request.num_computed_tokens = num_computed_tokens
237239
# Count the number of prefix cached tokens.

0 commit comments

Comments
 (0)