|
34 | 34 | from ..speculative.drafter import Drafter |
35 | 35 | from .kv_cache_transceiver import KvCacheTransceiver |
36 | 36 | from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState, |
37 | | - LlmResponse, LlmResult, executor_request_to_llm_request) |
| 37 | + LlmResponse, LlmResult, executor_request_to_llm_request, PyResult) |
38 | 38 | from .model_engine import ModelEngine |
39 | 39 | from .sampler import Sampler, SampleState, SampleStateTensors, TorchSampler |
40 | 40 | from .scheduler import RequestScheduler, ScheduledRequests |
@@ -1086,11 +1086,11 @@ def _prepare_draft_requests(self): |
1086 | 1086 |
|
def _sleep(self, sleep_request):
    """Acknowledge a sleep control request with a final, empty response.

    Clears the pending sleep flag and enqueues an ``LlmResponse`` whose
    ``PyResult`` reports success, keyed by the request's id (which also
    serves as the client id).
    """
    self.is_sleep_request = False
    req_id = sleep_request.id
    # Empty payload: the response only signals completion of the sleep request.
    ack = LlmResponse(
        request_id=req_id,
        result=LlmResult(
            result=None,
            py_result=PyResult(0, 0, success=True),
            is_final=True,
        ),
        client_id=req_id,
    )
    self._enqueue_responses({req_id: ack})
1090 | 1090 |
|
def _wakeup(self, wakeup_request):
    """Acknowledge a wakeup control request with a final, empty response.

    Clears the pending wakeup flag and enqueues an ``LlmResponse`` whose
    ``PyResult`` reports success, keyed by the request's id (which also
    serves as the client id).
    """
    self.is_wakeup_request = False
    req_id = wakeup_request.id
    # Empty payload: the response only signals completion of the wakeup request.
    ack = LlmResponse(
        request_id=req_id,
        result=LlmResult(
            result=None,
            py_result=PyResult(0, 0, success=True),
            is_final=True,
        ),
        client_id=req_id,
    )
    self._enqueue_responses({req_id: ack})
1094 | 1094 |
|
1095 | 1095 | def _update_weight(self, update_weight_request): |
1096 | 1096 | self.is_update_weight_request = False |
@@ -1119,13 +1119,13 @@ def _update_weight(self, update_weight_request): |
1119 | 1119 | self.model_engine.model.load_weights(weights) |
1120 | 1120 |
|
1121 | 1121 | torch.cuda.synchronize() |
1122 | | - update_weight_response = LlmResponse(request_id=update_weight_request.id, result=LlmResult(result=None, py_result=None, is_final=True), client_id=update_weight_request.id) |
| 1122 | + update_weight_response = LlmResponse(request_id=update_weight_request.id, result=LlmResult(result=None, py_result=PyResult(0, 0, success=True), is_final=True), client_id=update_weight_request.id) |
1123 | 1123 | self._enqueue_responses({update_weight_request.id: update_weight_response}) |
1124 | 1124 | except Exception as e: |
1125 | 1125 | print( |
1126 | 1126 | f"Error in VllmInternalWorkerExtension.update_weights_from_ipc_handles: {e}" |
1127 | 1127 | ) |
1128 | | - update_weight_response = LlmResponse(request_id=update_weight_request.id, result=LlmResult(result=None, py_result=None, is_final=True), client_id=update_weight_request.id) |
| 1128 | + update_weight_response = LlmResponse(request_id=update_weight_request.id, result=LlmResult(result=None, py_result=PyResult(0, 0, success=False), is_final=True), client_id=update_weight_request.id) |
1129 | 1129 | self._enqueue_responses({update_weight_request.id: update_weight_response}) |
1130 | 1130 |
|
1131 | 1131 | def _executor_loop_overlap(self): |
|
0 commit comments