Commit 3096dd1

yinggehmc-nv authored and committed
ci: L0_openai_vllm--base (#110)
1 parent 710bdd4 commit 3096dd1

File tree

3 files changed: +48 -26 lines changed

ci/L0_multi_gpu_vllm/multi_lora/test.sh

Lines changed: 0 additions & 6 deletions
@@ -97,8 +97,6 @@ check_response() {
 }
 
 # first we download weights
-pip install -U huggingface_hub
-
 rm -rf weights && mkdir -p weights/loras/GemmaDoll && mkdir -p weights/loras/GemmaSheep
 mkdir -p weights/backbone/gemma-2b
 
@@ -119,7 +117,6 @@ model_json=$(cat <<EOF
     "enforce_eager": true,
     "enable_lora": true,
     "max_lora_rank": 32,
-    "lora_extra_vocab_size": 256,
     "distributed_executor_backend":"ray"
 }
 EOF
@@ -210,7 +207,6 @@ model_json=$(cat <<EOF
     "enforce_eager": true,
     "enable_lora": "true",
     "max_lora_rank": 32,
-    "lora_extra_vocab_size": 256,
     "distributed_executor_backend":"ray"
 }
 EOF
@@ -288,7 +284,6 @@ model_json=$(cat <<EOF
     "block_size": 16,
     "enforce_eager": true,
     "enable_lora": false,
-    "lora_extra_vocab_size": 256,
     "distributed_executor_backend":"ray"
 }
 EOF
@@ -349,7 +344,6 @@ model_json=$(cat <<EOF
     "block_size": 16,
     "enforce_eager": true,
     "enable_lora": "false",
-    "lora_extra_vocab_size": 256,
     "distributed_executor_backend":"ray"
 }
 EOF
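
Note: each model_json above is a shell heredoc that must remain valid JSON after edits such as this removal of lora_extra_vocab_size. A quick sanity check in Python (a hypothetical snippet, not part of this commit; only the keys visible in the first hunk are reproduced) confirms the trimmed config still parses and no longer carries the removed key:

import json

# Visible portion of the first trimmed model_json heredoc above.
model_json = """
{
    "enforce_eager": true,
    "enable_lora": true,
    "max_lora_rank": 32,
    "distributed_executor_backend":"ray"
}
"""

config = json.loads(model_json)  # raises json.JSONDecodeError if malformed
assert "lora_extra_vocab_size" not in config
print(config["max_lora_rank"])  # 32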

src/model.py

Lines changed: 25 additions & 8 deletions
@@ -30,6 +30,7 @@
 import os
 import queue
 import threading
+import traceback
 from typing import Dict, List
 
 import numpy as np
@@ -244,7 +245,9 @@ def _init_engine(self):
         # failed to start, so the exception is passed back via the engine variable.
         if isinstance(self._llm_engine, Exception):
             e = self._llm_engine
-            self.logger.log_error(f"[vllm] Failed to start engine: {e}")
+            self.logger.log_error(
+                f"[vllm] Failed to start engine: {traceback.format_exc()}"
+            )
             if self._event_thread is not None:
                 self._event_thread.join()
                 self._event_thread = None
@@ -349,7 +352,6 @@ def _setup_lora(self):
                 lora_repository: Dict[str, str] = json.load(lora_file)
                 self.lora_repository = lora_repository
                 self.supported_loras: List[str] = list(self.lora_repository.keys())
-                self.supported_loras_len = len(self.supported_loras)
                 self.enable_lora = True
         except FileNotFoundError:
             raise FileNotFoundError(
@@ -398,7 +400,7 @@ def _response_loop(self):
                 response_state["is_cancelled"] = response_sender.is_cancelled()
             except Exception as e:
                 self.logger.log_error(
-                    f"An error occurred while sending a response: {e}"
+                    f"An error occurred while sending a response: {traceback.format_exc()}"
                 )
             finally:
                 if response_flag == pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL:
@@ -458,9 +460,22 @@ async def _infer(self, request):
         try:
             request_task_name = self._validate_request_task_name(request)
             if request_task_name == "generate":
-                request = GenerateRequest(
-                    request, self._llm_engine.generate, self.output_dtype, self.logger
-                )
+                if self.enable_lora:
+                    request = GenerateRequest(
+                        request,
+                        self._llm_engine.generate,
+                        self.output_dtype,
+                        self.logger,
+                        self.lora_repository,
+                        self.supported_loras,
+                    )
+                else:
+                    request = GenerateRequest(
+                        request,
+                        self._llm_engine.generate,
+                        self.output_dtype,
+                        self.logger,
+                    )
             elif request_task_name == "embed":
                 request = EmbedRequest(
                     request, self._llm_engine.encode, self.output_dtype, self.logger
@@ -533,7 +548,9 @@ async def _infer(self, request):
                 )
 
         except Exception as e:
-            self.logger.log_error(f"[vllm] Error generating stream: {e}")
+            self.logger.log_error(
+                f"[vllm] Error generating stream: {traceback.format_exc()}"
+            )
             error = pb_utils.TritonError(f"Error generating stream: {e}")
             text_output_tensor = pb_utils.Tensor(
                 "text_output", np.asarray(["N/A"], dtype=self.output_dtype)
@@ -591,7 +608,7 @@ def _check_health(self, requests):
                 future.result()
             except Exception as e:
                 self.logger.log_error(
-                    f"[vllm] Engine is not healthy and model will be unloaded: {e}"
+                    f"[vllm] Engine is not healthy and model will be unloaded: {traceback.format_exc()}"
                 )
                 pb_utils.unload_model(self.model_config["name"])  # non-blocking
                 self._is_healthy = False
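
The recurring change in this file swaps {e} for traceback.format_exc() in error logs, so failures are recorded with the full stack trace instead of only the exception message. A minimal self-contained sketch of the difference (log_error is a stand-in for self.logger.log_error, not code from this repo):

import traceback

def log_error(msg: str) -> None:
    # Stand-in for self.logger.log_error in model.py.
    print(msg)

try:
    int("not a number")
except ValueError as e:
    log_error(f"message only: {e}")
    # format_exc() must be called while the exception is being handled;
    # it returns the formatted traceback (file, line, call stack) as a string.
    log_error(f"full trace:\n{traceback.format_exc()}")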

src/utils/request.py

Lines changed: 23 additions & 12 deletions
@@ -28,7 +28,7 @@
 import json
 from abc import abstractmethod
 from io import BytesIO
-from typing import Callable
+from typing import Callable, Dict, List, Optional
 
 import numpy as np
 import triton_python_backend_utils as pb_utils
@@ -51,7 +51,7 @@ class RequestBase:
     def __init__(
         self, request, executor_callback: Callable, output_dtype: np.dtype, logger
     ):
-        self.request = request
+        self.triton_request = request
         self.executor_callback = executor_callback
         self.output_dtype = output_dtype
         self.logger = logger
@@ -74,20 +74,31 @@ def create_response(self, request_output, *args, **kwargs):
 
 class GenerateRequest(RequestBase):
     def __init__(
-        self, request, executor_callback: Callable, output_dtype: np.dtype, logger
+        self,
+        request,
+        executor_callback: Callable,
+        output_dtype: np.dtype,
+        logger,
+        lora_repository: Optional[Dict[str, str]] = None,
+        supported_loras: Optional[List[str]] = None,
     ):
         super().__init__(request, executor_callback, output_dtype, logger)
+        # Attributes for generate requests
+        if lora_repository is not None:
+            self.lora_repository = lora_repository
+        if supported_loras is not None:
+            self.supported_loras = supported_loras
 
     def _get_input_tensors(self):
         # prompt
         prompt = pb_utils.get_input_tensor_by_name(
-            self.request, "text_input"
+            self.triton_request, "text_input"
         ).as_numpy()[0]
         if isinstance(prompt, bytes):
             prompt = prompt.decode("utf-8")
 
         # image
-        images = pb_utils.get_input_tensor_by_name(self.request, "image")
+        images = pb_utils.get_input_tensor_by_name(self.triton_request, "image")
         if images:
             images_vllm = []
             for image_np in images.as_numpy():
@@ -101,15 +112,15 @@ def _get_input_tensors(self):
         }
 
         # stream
-        stream = pb_utils.get_input_tensor_by_name(self.request, "stream")
+        stream = pb_utils.get_input_tensor_by_name(self.triton_request, "stream")
         if stream:
             stream = stream.as_numpy()[0]
         else:
             stream = False
 
         # prepend_input / exclude_input_in_output
         prepend_input = pb_utils.get_input_tensor_by_name(
-            self.request, "exclude_input_in_output"
+            self.triton_request, "exclude_input_in_output"
         )
         if prepend_input:
             # When `exclude_input_in_output` is False, we want to prepend input prompt
@@ -128,12 +139,12 @@ def _get_input_tensors(self):
         # An alternative mechanism to receive serialized parameters as an input
         # tensor, because request parameters are not yet supported via BLS.
         sampling_parameters = pb_utils.get_input_tensor_by_name(
-            self.request, "sampling_parameters"
+            self.triton_request, "sampling_parameters"
         )
         if sampling_parameters:
             parameters = sampling_parameters.as_numpy()[0].decode("utf-8")
         else:
-            parameters = self.request.parameters()
+            parameters = self.triton_request.parameters()
 
         # additional outputs
         additional_outputs = {
@@ -144,7 +155,7 @@ def _get_input_tensors(self):
             "return_num_output_tokens": None,
         }
         for tensor_name in additional_outputs.keys():
-            tensor = pb_utils.get_input_tensor_by_name(self.request, tensor_name)
+            tensor = pb_utils.get_input_tensor_by_name(self.triton_request, tensor_name)
             if tensor:
                 tensor = bool(tensor.as_numpy()[0])
             else:
@@ -302,7 +313,7 @@ def __init__(
 
     def _get_input_tensors(self):
         embedding_request = pb_utils.get_input_tensor_by_name(
-            self.request, "embedding_request"
+            self.triton_request, "embedding_request"
         ).as_numpy()[0]
         embedding_request = json.loads(embedding_request.decode("utf-8"))
         # prompt
@@ -324,7 +335,7 @@ def _get_input_tensors(self):
             "return_num_output_tokens": None,
         }
         for tensor_name in additional_outputs.keys():
-            tensor = pb_utils.get_input_tensor_by_name(self.request, tensor_name)
+            tensor = pb_utils.get_input_tensor_by_name(self.triton_request, tensor_name)
             if tensor:
                 tensor = bool(tensor.as_numpy()[0])
             else:
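
The new lora_repository and supported_loras parameters are optional and only stored when supplied, which is why _infer in src/model.py above can construct GenerateRequest with or without them depending on enable_lora. A condensed, self-contained sketch of that guarded-assignment pattern (GenerateRequestSketch is hypothetical, not repo code; the sample adapter name comes from the test above):

from typing import Dict, List, Optional

class GenerateRequestSketch:
    # Mirrors the guarded assignments in GenerateRequest.__init__ above.
    def __init__(
        self,
        lora_repository: Optional[Dict[str, str]] = None,
        supported_loras: Optional[List[str]] = None,
    ):
        if lora_repository is not None:
            self.lora_repository = lora_repository
        if supported_loras is not None:
            self.supported_loras = supported_loras

with_lora = GenerateRequestSketch(
    {"GemmaDoll": "weights/loras/GemmaDoll"}, ["GemmaDoll"]
)
without_lora = GenerateRequestSketch()
print(hasattr(with_lora, "lora_repository"))     # True
print(hasattr(without_lora, "lora_repository"))  # False

Setting the attributes only when LoRA data is present lets downstream code branch on attribute presence rather than carry None values around.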

0 commit comments