Skip to content

Commit 41cd3e2

Browse files
[Feature] Enable prefix caching as default (#3816)
* [Feature] Enable prefix caching as default

* [Feature] Enable prefix caching as default

* Set prefix caching as default

* skip dynamic load

* fix kill bug

* fix kill bug

* fix kill bug

* fix ci

* fix

---------

Co-authored-by: Jiang-Jia-Jun <[email protected]>
1 parent 11b18e5 commit 41cd3e2

File tree

6 files changed

+37
-5
lines changed

6 files changed

+37
-5
lines changed

fastdeploy/engine/args_utils.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515
"""
1616

17+
import argparse
1718
import json
1819
from dataclasses import asdict, dataclass
1920
from dataclasses import fields as dataclass_fields
@@ -190,7 +191,7 @@ class EngineArgs:
190191
"""
191192
Flag to indicate whether to use warm-up before inference.
192193
"""
193-
enable_prefix_caching: bool = False
194+
enable_prefix_caching: bool = True
194195
"""
195196
Flag to enable prefix caching.
196197
"""
@@ -387,6 +388,16 @@ def __post_init__(self):
387388
"""
388389
if not self.tokenizer:
389390
self.tokenizer = self.model
391+
if self.splitwise_role == "decode":
392+
self.enable_prefix_caching = False
393+
if self.speculative_config is not None:
394+
self.enable_prefix_caching = False
395+
if self.enable_mm:
396+
self.enable_prefix_caching = False
397+
if not current_platform.is_cuda():
398+
self.enable_prefix_caching = False
399+
if self.dynamic_load_weight:
400+
self.enable_prefix_caching = False
390401
if self.enable_logprob:
391402
if self.speculative_config is not None:
392403
raise NotImplementedError("Logprob does not support speculation_config.")
@@ -725,7 +736,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
725736
perf_group = parser.add_argument_group("Performance Tuning")
726737
perf_group.add_argument(
727738
"--enable-prefix-caching",
728-
action="store_true",
739+
action=argparse.BooleanOptionalAction,
729740
default=EngineArgs.enable_prefix_caching,
730741
help="Flag to enable prefix caching.",
731742
)

fastdeploy/engine/engine.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,8 @@ def _exit_sub_services(self):
342342
for p in self.cache_manager_processes:
343343
llm_logger.info(f"Killing cache manager process {p.pid}")
344344
try:
345-
os.killpg(p.pid, signal.SIGTERM)
345+
pgid = os.getpgid(p.pid)
346+
os.killpg(pgid, signal.SIGTERM)
346347
except Exception as e:
347348
console_logger.error(
348349
f"Error killing cache manager process {p.pid}: {e}, {str(traceback.format_exc())}"

fastdeploy/worker/gpu_model_runner.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
221221
req_len = len(req_dicts)
222222
has_prefill_task = False
223223
has_decode_task = False
224+
has_preempted_task = False
224225
for i in range(req_len):
225226
request = req_dicts[i]
226227
idx = request.idx
@@ -320,6 +321,7 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
320321
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
321322
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0
322323
self.share_inputs["is_block_step"][idx : idx + 1] = False
324+
has_preempted_task = True
323325
continue
324326

325327
assert len(request.eos_token_ids) == self.model_config.eos_tokens_lens
@@ -375,6 +377,10 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
375377

376378
if has_prefill_task or has_decode_task:
377379
self.share_inputs["not_need_stop"][0] = True
380+
if has_preempted_task:
381+
self.share_inputs["not_need_stop"][0] = not (
382+
self.share_inputs["stop_flags"].sum() == self.parallel_config.max_num_seqs
383+
)
378384
self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]
379385

380386
def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: int = None):

scripts/coverage_run.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ for file in $TEST_FILES; do
3232
else
3333
success_pytest=$((success_pytest+1))
3434
fi
35+
ps -ef | grep "${FD_CACHE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9
3536
done
3637

3738
##################################

scripts/run_pre_ce.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ for subdir in "$run_path"*/; do
2727
timeout 600 python -m pytest --disable-warnings -sv "$file"
2828
exit_code=$?
2929
set -e
30-
30+
ps -ef | grep "${FD_CACHE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9
3131
if [ $exit_code -ne 0 ]; then
3232
if [ -f "${subdir%/}/log/workerlog.0" ]; then
3333
echo "---------------- log/workerlog.0 -------------------"

tests/ce/deploy/deploy.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,19 @@ def stop_server(signum=None, frame=None):
181181
except Exception as e:
182182
print(f"Failed to stop server: {e}, {str(traceback.format_exc())}")
183183

184+
try:
185+
result = subprocess.run(
186+
f"ps -ef -ww | grep {FD_CACHE_QUEUE_PORT} | grep -v grep", shell=True, capture_output=True, text=True
187+
)
188+
for line in result.stdout.strip().split("\n"):
189+
if not line:
190+
continue
191+
parts = line.split()
192+
pid = int(parts[1]) # ps -ef 的第二列是 PID
193+
print(f"Killing PID: {pid}")
194+
os.kill(pid, signal.SIGKILL)
195+
except Exception as e:
196+
print(f"Failed to kill cache manager process: {e}, {str(traceback.format_exc())}")
184197
for port in [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT]:
185198
try:
186199
output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip()
@@ -285,7 +298,7 @@ def start_service():
285298
def switch_service():
286299
"""切换模型服务"""
287300
# kill掉已有服务
288-
stop_server()
301+
res, status_code = stop_server()
289302
time.sleep(2)
290303

291304
try:

0 commit comments

Comments
 (0)