Skip to content

Commit 86263c5

Browse files
committed
modify default values to fix bug
Signed-off-by: whx-sjtu <2952154980@qq.com>
1 parent e8ba37b commit 86263c5

File tree

3 files changed

+21
-26
lines changed

3 files changed

+21
-26
lines changed

docs/source/user_guide/configuration/additional_config.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ The details of each config option are as follows:
6767
| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
6868
| `enable_pd_transfer` | bool | `False` | Whether to enable pd transfer. When using it, decode is started only when prefill of all requests is done. This option only takes effect on offline inference. |
6969
| `decode_max_num_seqs` | int | `0` | Whether to change max_num_seqs of decode phase when enable pd transfer. This option only takes effect when enable_pd_transfer is True. |
70-
| `max_long_partial_prefill_tokens` | Union[int, float] | `float('inf')` | the maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. |
71-
| `ascend_long_prefill_token_threshold` | Union[int, float] | `float('inf')` | a request is considered long if the prompt is longer than this number of tokens. |
70+
| `max_long_partial_prefills` | int | `1` | The maximum number of prompts longer than `long_prefill_token_threshold` that will be prefilled concurrently. |
71+
| `long_prefill_token_threshold` | int | `0` | A request is considered long if its prompt is longer than this number of tokens. When left at `0`, the threshold is derived as 4% of `max_model_len`. |
7272

7373
ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
7474

vllm_ascend/core/schedule_config.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,13 @@
2020

2121
from vllm.config import SchedulerConfig
2222

23-
MAX_INT = 2147483647
2423

2524

2625
@dataclass
2726
class AscendSchedulerConfig(SchedulerConfig):
2827
enable_chunked_prefill: bool = False
29-
max_long_partial_prefill_tokens: int = MAX_INT
30-
ascend_long_prefill_token_threshold: int = MAX_INT
28+
max_long_partial_prefills: int = 1
29+
long_prefill_token_threshold: int = 0
3130
policy: str = "fcfs"
3231
scheduler_cls: Union[str, Type[object]] = (
3332
"vllm_ascend.core.scheduler.AscendScheduler")
@@ -71,27 +70,27 @@ def __post_init__(self) -> None:
7170
"max_num_batched_tokens and makes vLLM reject longer "
7271
"sequences. Please increase max_num_batched_tokens or "
7372
"decrease max_model_len.")
74-
# concurrent partial prefills. Default is inf
75-
if self.max_long_partial_prefill_tokens is None:
76-
self.max_long_partial_prefill_tokens = MAX_INT
77-
self.ascend_long_prefill_token_threshold = MAX_INT
73+
# concurrent partial prefills. Default is 1 meaning not enabled.
74+
if self.max_long_partial_prefills is None:
75+
self.max_long_partial_prefills = 1
76+
self.long_prefill_token_threshold = 0
7877

79-
if self.ascend_long_prefill_token_threshold is None or \
80-
self.ascend_long_prefill_token_threshold <= 0:
78+
if self.long_prefill_token_threshold is None or \
79+
self.long_prefill_token_threshold <= 0:
8180
if self.max_model_len is None:
82-
self.ascend_long_prefill_token_threshold = MAX_INT
81+
self.long_prefill_token_threshold = 0
8382
else:
84-
self.ascend_long_prefill_token_threshold = \
83+
self.long_prefill_token_threshold = \
8584
max(1, int(self.max_model_len * 0.04))
8685

87-
if self.max_long_partial_prefill_tokens < 0:
86+
if self.max_long_partial_prefills < 0:
8887
raise ValueError(
89-
f"max_long_partial_prefill_tokens must be non-negative, but got "
90-
f"{self.max_long_partial_prefill_tokens}")
91-
if self.ascend_long_prefill_token_threshold < 0:
88+
f"max_long_partial_prefills must be non-negative, but got "
89+
f"{self.max_long_partial_prefills}")
90+
if self.long_prefill_token_threshold < 0:
9291
raise ValueError(
93-
f"ascend_long_prefill_token_threshold must be non-negative, but got "
94-
f"{self.ascend_long_prefill_token_threshold}")
92+
f"long_prefill_token_threshold must be non-negative, but got "
93+
f"{self.long_prefill_token_threshold}")
9594

9695
if self.policy != "fcfs":
9796
raise NotImplementedError(

vllm_ascend/core/scheduler.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,6 @@
3232
from vllm.v1.request import Request, RequestStatus
3333
from vllm.v1.structured_output import StructuredOutputManager
3434

35-
from vllm_ascend.ascend_config import get_ascend_config
36-
from vllm_ascend.core.schedule_config import MAX_INT
37-
3835

3936
class AscendScheduler(Scheduler):
4037
"""This Scheduler extends vllm's original v1 scheduler
@@ -63,7 +60,6 @@ def __init__(
6360
self.phase = "" if not enable_pd_transfer else "prefill"
6461
self.decode_max_num_running_reqs = max(self.max_num_running_reqs,
6562
decode_max_num_seqs)
66-
self.ascend_config = get_ascend_config()
6763

6864
def schedule(self) -> SchedulerOutput:
6965
if self.scheduler_config.chunked_prefill_enabled:
@@ -108,12 +104,12 @@ def schedule(self) -> SchedulerOutput:
108104
self.phase = "decode"
109105
# Skip long prompt requests in prefill stage.
110106
# long_prefill_budget is float('inf') if not used.
111-
if self.ascend_config.ascend_scheduler_config.max_long_partial_prefill_tokens == MAX_INT:
107+
if self.vllm_config.scheduler_config.long_prefill_token_threshold == 0:
112108
long_prefill_budget = float('inf')
113109
long_prefill_token_threshold = float('inf')
114110
else:
115-
long_prefill_budget = self.ascend_config.ascend_scheduler_config.max_long_partial_prefill_tokens
116-
long_prefill_token_threshold = self.ascend_config.ascend_scheduler_config.ascend_long_prefill_token_threshold
111+
long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills
112+
long_prefill_token_threshold = self.vllm_config.scheduler_config.long_prefill_token_threshold
117113

118114
# Schedule prefill requests first.
119115
while self.waiting and token_budget > 0:

0 commit comments

Comments
 (0)