From 410fe43aa6269820188baf295ec17913e2c3039d Mon Sep 17 00:00:00 2001
From: Csrayz
Date: Wed, 13 Aug 2025 22:00:36 +0800
Subject: [PATCH 1/4] [FEAT] add Concurrent Partial Prefills

Implement concurrent partial prefills: limit how many prompts longer
than long_prefill_token_threshold may be prefilled at the same time.

Signed-off-by: Csrayz
---
 vllm_ascend/core/schedule_config.py | 17 ++++++++++++++++-
 vllm_ascend/core/scheduler.py       | 10 ++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/core/schedule_config.py b/vllm_ascend/core/schedule_config.py
index 4a4131ecd7..5c001b0fec 100644
--- a/vllm_ascend/core/schedule_config.py
+++ b/vllm_ascend/core/schedule_config.py
@@ -16,7 +16,7 @@
 #
 
 from dataclasses import dataclass, fields
-from typing import Type, Union
+from typing import Optional, Type, Union
 
 from vllm.config import SchedulerConfig
 
@@ -24,6 +24,8 @@
 @dataclass
 class AscendSchedulerConfig(SchedulerConfig):
     enable_chunked_prefill: bool = False
+    max_long_partial_prefills: Optional[Union[int, float]] = None
+    long_prefill_token_threshold: Optional[Union[int, float]] = None
     policy: str = "fcfs"
     num_scheduler_steps: int = 1
     scheduler_cls: Union[str, Type[object]] = (
@@ -41,6 +43,8 @@ def initialize_from_config(
         }
         # Override default values into original SchedulerConfig
         scheduler_config["enable_chunked_prefill"] = False
+        scheduler_config["max_long_partial_prefills"] = None
+        scheduler_config["long_prefill_token_threshold"] = None
         scheduler_config["policy"] = "fcfs"
         scheduler_config["num_scheduler_steps"] = 1
         scheduler_config["scheduler_cls"] = (
@@ -55,6 +59,17 @@ def __post_init__(self) -> None:
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
         self.chunked_prefill_enabled = self.enable_chunked_prefill
+        # Concurrent partial prefills. Defaults to inf (unlimited).
+        if self.max_long_partial_prefills is None:
+            self.max_long_partial_prefills = float('inf')
+            self.long_prefill_token_threshold = float('inf')
+        else:
+            if self.long_prefill_token_threshold is None:
+                self.long_prefill_token_threshold = \
+                    int(self.max_model_len * 0.04)
+
+        assert (self.max_long_partial_prefills > 0)
+        assert (self.long_prefill_token_threshold > 0)
         if self.policy != "fcfs":
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py
index dfdc9aa863..24d6060765 100644
--- a/vllm_ascend/core/scheduler.py
+++ b/vllm_ascend/core/scheduler.py
@@ -75,6 +75,10 @@ def schedule(self) -> SchedulerOutput:
         # and put back at the head of the waiting queue later
         skipped_waiting_requests: deque[Request] = deque()
 
+        # Skip long prompt requests in prefill stage.
+        # long_prefill_budget is float('inf') if not set.
+        long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills
+
         # Schedule prefill requests first.
         while self.waiting and token_budget > 0:
             if len(self.running) == self.max_num_running_reqs:
@@ -173,6 +177,11 @@ def skip_cur_request():
                 skip_cur_request()
                 continue
 
+            if num_new_tokens > self.vllm_config.scheduler_config.long_prefill_token_threshold \
+                    and long_prefill_budget <= 0:
+                skip_cur_request()
+                continue
+
             new_blocks = self.kv_cache_manager.allocate_slots(
                 request,
                 num_new_tokens + num_external_computed_tokens,
@@ -222,6 +231,7 @@ def skip_cur_request():
             # Update request info.
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
+            long_prefill_budget -= 1
             request.status = RequestStatus.RUNNING
             request.num_computed_tokens = num_computed_tokens
             # Count the number of prefix cached tokens.

From 3662c439fe57549094304e65c712de4ac7f84501 Mon Sep 17 00:00:00 2001
From: Csrayz
Date: Thu, 14 Aug 2025 00:00:07 +0800
Subject: [PATCH 2/4] [Doc] document the new ascend_scheduler_config options

Signed-off-by: Csrayz
---
 docs/source/user_guide/configuration/additional_config.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source/user_guide/configuration/additional_config.md b/docs/source/user_guide/configuration/additional_config.md
index df01430df1..f1f004c24f 100644
--- a/docs/source/user_guide/configuration/additional_config.md
+++ b/docs/source/user_guide/configuration/additional_config.md
@@ -53,6 +53,8 @@ The details of each config option are as follows:
 | Name | Type | Default | Description |
 | ---- | ---- | ------- | ----------- |
 | `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
+| `max_long_partial_prefills` | Union[int, float] | `float('inf')` | The maximum number of prompts longer than `long_prefill_token_threshold` that will be prefilled concurrently. |
+| `long_prefill_token_threshold` | Union[int, float] | `False` | A request is considered long if the prompt is longer than this number of tokens. |
 
 ascend_scheduler_config also supports the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
 
@@ -73,6 +75,8 @@ An example of additional configuration is as follows:
         "ascend_scheduler_config": {
             "enabled": True,
             "enable_chunked_prefill": True,
+            "max_long_partial_prefills": 1,
+            "long_prefill_token_threshold": 4096,
         },
         "refresh": False,
     }

From 8e0adbe7235e5ea311b6fccb2e818f2ba625e25f Mon Sep 17 00:00:00 2001
From: Csrayz
Date: Thu, 14 Aug 2025 00:07:09 +0800
Subject: [PATCH 3/4] [Fix] fix doc default and long-prefill budget accounting

Signed-off-by: Csrayz
---
 docs/source/user_guide/configuration/additional_config.md | 2 +-
 vllm_ascend/core/schedule_config.py                       | 2 +-
 vllm_ascend/core/scheduler.py                             | 6 ++++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/docs/source/user_guide/configuration/additional_config.md b/docs/source/user_guide/configuration/additional_config.md
index f1f004c24f..1e3d46be7f 100644
--- a/docs/source/user_guide/configuration/additional_config.md
+++ b/docs/source/user_guide/configuration/additional_config.md
@@ -54,7 +54,7 @@ The details of each config option are as follows:
 | ---- | ---- | ------- | ----------- |
 | `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
 | `max_long_partial_prefills` | Union[int, float] | `float('inf')` | The maximum number of prompts longer than `long_prefill_token_threshold` that will be prefilled concurrently. |
-| `long_prefill_token_threshold` | Union[int, float] | `False` | A request is considered long if the prompt is longer than this number of tokens. |
+| `long_prefill_token_threshold` | Union[int, float] | `float('inf')` | A request is considered long if the prompt is longer than this number of tokens. |
 
 ascend_scheduler_config also supports the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
diff --git a/vllm_ascend/core/schedule_config.py b/vllm_ascend/core/schedule_config.py
index 5c001b0fec..5db466d03f 100644
--- a/vllm_ascend/core/schedule_config.py
+++ b/vllm_ascend/core/schedule_config.py
@@ -66,7 +66,7 @@ def __post_init__(self) -> None:
         else:
             if self.long_prefill_token_threshold is None:
                 self.long_prefill_token_threshold = \
-                    int(self.max_model_len * 0.04)
+                    max(1, int(self.max_model_len * 0.04))
 
         assert (self.max_long_partial_prefills > 0)
         assert (self.long_prefill_token_threshold > 0)
diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py
index 24d6060765..b8a84301c4 100644
--- a/vllm_ascend/core/scheduler.py
+++ b/vllm_ascend/core/scheduler.py
@@ -78,6 +78,7 @@ def schedule(self) -> SchedulerOutput:
         # Skip long prompt requests in prefill stage.
         # long_prefill_budget is float('inf') if not set.
         long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills
+        long_prefill_token_threshold = self.vllm_config.scheduler_config.long_prefill_token_threshold
 
         # Schedule prefill requests first.
         while self.waiting and token_budget > 0:
@@ -177,7 +178,7 @@ def skip_cur_request():
                 skip_cur_request()
                 continue
 
-            if num_new_tokens > self.vllm_config.scheduler_config.long_prefill_token_threshold \
+            if num_new_tokens > long_prefill_token_threshold \
                     and long_prefill_budget <= 0:
                 skip_cur_request()
                 continue
@@ -231,7 +232,8 @@ def skip_cur_request():
             # Update request info.
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
-            long_prefill_budget -= 1
+            if num_new_tokens > long_prefill_token_threshold:
+                long_prefill_budget -= 1
             request.status = RequestStatus.RUNNING
             request.num_computed_tokens = num_computed_tokens
             # Count the number of prefix cached tokens.

From 55cafcbca77c39690b9f9955c94a7fa1a381f28d Mon Sep 17 00:00:00 2001
From: Csrayz
Date: Thu, 14 Aug 2025 15:45:52 +0800
Subject: [PATCH 4/4] [FIX] assert -> raise ValueError

Replace the asserts with explicit ValueErrors according to code review
comments.

Signed-off-by: Csrayz
---
 vllm_ascend/core/schedule_config.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/core/schedule_config.py b/vllm_ascend/core/schedule_config.py
index 5db466d03f..5cc53d933e 100644
--- a/vllm_ascend/core/schedule_config.py
+++ b/vllm_ascend/core/schedule_config.py
@@ -68,8 +68,15 @@ def __post_init__(self) -> None:
             if self.long_prefill_token_threshold is None:
                 self.long_prefill_token_threshold = \
                     max(1, int(self.max_model_len * 0.04))
 
-        assert (self.max_long_partial_prefills > 0)
-        assert (self.long_prefill_token_threshold > 0)
+        if self.max_long_partial_prefills <= 0:
+            raise ValueError(
+                f"max_long_partial_prefills must be positive, but got "
+                f"{self.max_long_partial_prefills}")
+        if self.long_prefill_token_threshold <= 0:
+            raise ValueError(
+                f"long_prefill_token_threshold must be positive, but got "
+                f"{self.long_prefill_token_threshold}")
+
         if self.policy != "fcfs":
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
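
For reference, a minimal sketch of how the new options are passed in from user code, following the `additional_config` example documented in patch 2; it assumes vllm with the vllm-ascend plugin installed, and the model name is only a placeholder:

```python
from vllm import LLM

# A sketch assuming vllm-ascend is installed, so that vllm hands
# additional_config through to AscendSchedulerConfig as documented above.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model choice
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            "enable_chunked_prefill": True,
            # At most one prompt classified as "long" is prefilled at a time...
            "max_long_partial_prefills": 1,
            # ...where "long" means more than 4096 prompt tokens.
            "long_prefill_token_threshold": 4096,
        },
        "refresh": False,
    },
)
```

If `max_long_partial_prefills` is set but `long_prefill_token_threshold` is not, the threshold defaults to `max(1, int(max_model_len * 0.04))`; for example, with `max_model_len = 32768` that is `int(32768 * 0.04) = 1310` tokens.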
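To make the scheduling behavior concrete, here is a toy, self-contained simulation (plain Python, not vllm code) of the gating rule the series converges on: only prompts longer than the threshold consume budget, and once the budget is exhausted, further long prompts are skipped and put back at the head of the waiting queue for the next pass.

```python
from collections import deque

LONG_PREFILL_TOKEN_THRESHOLD = 4096  # long_prefill_token_threshold
MAX_LONG_PARTIAL_PREFILLS = 1        # max_long_partial_prefills

def schedule_pass(waiting: deque) -> list:
    """One prefill scheduling pass over prompt lengths (token counts)."""
    long_prefill_budget = MAX_LONG_PARTIAL_PREFILLS
    scheduled, skipped = [], []
    while waiting:
        num_new_tokens = waiting.popleft()
        # Skip long requests once the budget is spent (skip_cur_request()).
        if num_new_tokens > LONG_PREFILL_TOKEN_THRESHOLD \
                and long_prefill_budget <= 0:
            skipped.append(num_new_tokens)
            continue
        # Per patch 3: only long prefills consume the budget.
        if num_new_tokens > LONG_PREFILL_TOKEN_THRESHOLD:
            long_prefill_budget -= 1
        scheduled.append(num_new_tokens)
    # Skipped requests go back to the head of the waiting queue.
    waiting.extendleft(reversed(skipped))
    return scheduled

waiting = deque([8192, 100, 9000, 50])
print(schedule_pass(waiting))  # [8192, 100, 50]
print(list(waiting))           # [9000], retried on the next pass
```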