Commit a296189
initial generate support
1 parent 7c4fd6c

File tree: 5 files changed (+156, -13 lines)

src/c++/perf_analyzer/client_backend/openai/openai_client.cc

Lines changed: 23 additions & 6 deletions
@@ -63,6 +63,14 @@ namespace openai {
 void
 ChatCompletionRequest::SendResponse(bool is_final, bool is_null)
 {
+  // If the final response has already been sent
+  // (because [DONE] was detected), ignore the final
+  // response triggered by request completion.
+  if (final_response_sent_) {
+    return;
+  }
+
+  final_response_sent_ = is_final;
   response_callback_(new ChatCompletionResult(
       http_code_, std::move(response_buffer_), is_final, is_null, request_id_));
 }
@@ -107,13 +115,15 @@ ChatCompletionClient::ResponseHeaderHandler(
       hdr.find("text/event-stream") != std::string::npos) {
     request->is_stream_ = true;
   }
+
   return byte_size;
 }

 size_t
 ChatCompletionClient::ResponseHandler(
     void* contents, size_t size, size_t nmemb, void* userp)
 {
+
   // [TODO TMA-1666] verify if the SSE responses received are complete, or the
   // response need to be stitched first. To verify, print out the received
   // responses from SendResponse() to make sure the OpenAI server doesn't chunk
@@ -151,7 +161,7 @@ ChatCompletionClient::ResponseHandler(
   // RECV_END so that we always have the time of the last.
   request->timer_.CaptureTimestamp(
       triton::client::RequestTimers::Kind::RECV_END);
-
+
   return result_bytes;
 }
@@ -162,6 +172,8 @@ ChatCompletionClient::AsyncInfer(
     std::string& serialized_request_body, const std::string& request_id,
     const Headers& headers)
 {
+
+
   if (callback == nullptr) {
     return Error(
         "Callback function must be provided along with AsyncInfer() call.");
@@ -172,9 +184,14 @@ ChatCompletionClient::AsyncInfer(
     request->timer_.CaptureTimestamp(
         triton::client::RequestTimers::Kind::REQUEST_END);
     UpdateInferStat(request->timer_);
-    if (!request->is_stream_) {
-      request->SendResponse(true /* is_final */, false /* is_null */);
-    }
+
+    // Updated so this is OK to call multiple times;
+    // only the first final response will be sent.
+    //
+    // if (!request->is_stream_) {
+    //
+    request->SendResponse(true /* is_final */, false /* is_null */);
+    // }
   };
   std::unique_ptr<HttpRequest> request(new ChatCompletionRequest(
       std::move(completion_callback), std::move(callback), request_id,
@@ -185,7 +202,7 @@ ChatCompletionClient::AsyncInfer(
   request->AddInput(
       reinterpret_cast<uint8_t*>(serialized_request_body.data()),
       serialized_request_body.size());
-
+
   CURL* multi_easy_handle = curl_easy_init();
   Error err = PreRunProcessing(multi_easy_handle, raw_request, headers);
   if (!err.IsOk()) {
@@ -226,7 +243,7 @@ ChatCompletionClient::PreRunProcessing(

   // response data handled by ResponseHandler()
   curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ResponseHandler);
-  curl_easy_setopt(curl, CURLOPT_WRITEDATA, request);
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, request);

   const curl_off_t post_byte_size = request->total_input_byte_size_;
   curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE_LARGE, post_byte_size);

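The guard added above makes SendResponse() safe to call more than once: with streaming, a final response can be triggered once when the [DONE] event is detected and again when the request completes, and only the first of those should reach the callback. A minimal Python sketch of the same idempotent-final-response pattern (the class and names below are illustrative, not part of the C++ client):

    class StreamingRequest:
        """Forwards responses to a callback, sending the final response at most once."""

        def __init__(self, callback):
            self._callback = callback
            self._final_response_sent = False

        def send_response(self, payload, is_final):
            # A second "final" trigger (e.g. request completion after [DONE])
            # is silently ignored, mirroring final_response_sent_ above.
            if self._final_response_sent:
                return
            self._final_response_sent = is_final
            self._callback(payload, is_final)

As in the C++ change, the flag is assigned from is_final, so non-final streaming responses keep flowing until the first final one arrives.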
src/c++/perf_analyzer/client_backend/openai/openai_client.h

Lines changed: 2 additions & 1 deletion
@@ -127,6 +127,7 @@ class ChatCompletionRequest : public HttpRequest {
   // The timers for infer request.
   triton::client::RequestTimers timer_;
   const std::string request_id_;
+  bool final_response_sent_{false};
 };

 class ChatCompletionClient : public HttpClient {
@@ -172,7 +173,7 @@ class ChatCompletionClient : public HttpClient {
       void* contents, size_t size, size_t nmemb, void* userp);
   static size_t ResponseHeaderHandler(
       void* contents, size_t size, size_t nmemb, void* userp);
-
+
   Error UpdateInferStat(const triton::client::RequestTimers& timer);
   InferStat infer_stat_;
 };

src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py

Lines changed: 103 additions & 1 deletion
@@ -36,6 +36,7 @@ class PromptSource(Enum):
 class OutputFormat(Enum):
     OPENAI_CHAT_COMPLETIONS = auto()
     OPENAI_COMPLETIONS = auto()
+    TRITON_GENERATE = auto()
     TENSORRTLLM = auto()
     VLLM = auto()

@@ -356,7 +357,18 @@ def _convert_generic_json_to_output_format(
         output_tokens_deterministic: bool,
         model_name: str = "",
     ) -> Dict:
-        if output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS:
+        if output_format == OutputFormat.TRITON_GENERATE:
+            output_json = cls._convert_generic_json_to_generate_format(
+                generic_dataset,
+                add_model_name,
+                add_stream,
+                extra_inputs,
+                output_tokens_mean,
+                output_tokens_stddev,
+                output_tokens_deterministic,
+                model_name,
+            )
+        elif output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS:
             output_json = cls._convert_generic_json_to_openai_chat_completions_format(
                 generic_dataset,
                 add_model_name,
@@ -440,6 +452,43 @@ def _convert_generic_json_to_openai_chat_completions_format(

         return pa_json

+    @classmethod
+    def _convert_generic_json_to_generate_format(
+        cls,
+        dataset_json: Dict,
+        add_model_name: bool,
+        add_stream: bool,
+        extra_inputs: Dict,
+        output_tokens_mean: int,
+        output_tokens_stddev: int,
+        output_tokens_deterministic: bool,
+        model_name: str = "",
+    ) -> Dict:
+        (
+            system_role_headers,
+            user_role_headers,
+            text_input_headers,
+        ) = cls._determine_json_feature_roles(dataset_json)
+
+        pa_json = cls._populate_triton_generate_output_json(
+            dataset_json,
+            system_role_headers,
+            user_role_headers,
+            text_input_headers,
+            add_model_name,
+            add_stream,
+            extra_inputs,
+            output_tokens_mean,
+            output_tokens_stddev,
+            output_tokens_deterministic,
+            model_name,
+        )
+
+        return pa_json
+
     @classmethod
     def _convert_generic_json_to_openai_completions_format(
         cls,
@@ -617,6 +666,59 @@ def _populate_openai_chat_completions_output_json(
         )

         return pa_json
+
+    @classmethod
+    def _populate_triton_generate_output_json(
+        cls,
+        dataset: Dict,
+        system_role_headers: List[str],
+        user_role_headers: List[str],
+        text_input_headers: List[str],
+        add_model_name: bool,
+        add_stream: bool,
+        extra_inputs: Dict,
+        output_tokens_mean: int,
+        output_tokens_stddev: int,
+        output_tokens_deterministic: bool,
+        model_name: str = "",
+    ) -> Dict:
+        number_of_rows = len(dataset["rows"])
+        pa_json = cls._create_empty_trtllm_pa_json()
+
+        default_max_tokens = (
+            "max_tokens" not in extra_inputs
+            or output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN
+        )
+
+        pa_json = {"data": [{"payload": [{}]} for _ in dataset["rows"]]}
+
+        for index, entry in enumerate(dataset["rows"]):
+            for header, content in entry.items():
+                new_text_input = cls._create_new_text_input(
+                    header,
+                    system_role_headers,
+                    user_role_headers,
+                    text_input_headers,
+                    content,
+                )
+                pa_json["data"][index]["payload"][0]["text_input"] = new_text_input
+
+            pa_json = cls._add_optional_tags_to_openai_json(
+                pa_json,
+                index,
+                False,
+                add_stream,
+                extra_inputs,
+                output_tokens_mean,
+                output_tokens_stddev,
+                output_tokens_deterministic,
+                model_name,
+            )
+
+        return pa_json

     @classmethod
     def _populate_openai_completions_output_json(

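For Triton's generate endpoint, the converter above writes each prompt under the "text_input" key rather than OpenAI's "messages" or "prompt", one payload object per dataset row. Note that the committed helper first calls _create_empty_trtllm_pa_json() and then immediately overwrites pa_json with a dict literal, so that call (and number_of_rows / default_max_tokens) is effectively unused here. A simplified, self-contained sketch of the structure being built (an illustration only, not the genai-perf implementation; role-header handling is omitted and the optional-tag placement is assumed):

    from typing import Dict, List

    def build_generate_pa_json(prompts: List[str], stream: bool, extra_inputs: Dict) -> Dict:
        # One payload object per dataset row, as in _populate_triton_generate_output_json.
        pa_json = {"data": [{"payload": [{}]} for _ in prompts]}
        for index, prompt in enumerate(prompts):
            payload = pa_json["data"][index]["payload"][0]
            payload["text_input"] = prompt  # Triton generate prompt field
            if stream:
                payload["stream"] = True  # request SSE streaming
            payload.update(extra_inputs)  # e.g. {"max_tokens": 256}
        return pa_json

    # build_generate_pa_json(["What is Triton?"], True, {"max_tokens": 256}) returns
    # {"data": [{"payload": [{"text_input": "What is Triton?", "stream": True, "max_tokens": 256}]}]}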
src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py

Lines changed: 23 additions & 2 deletions
@@ -45,6 +45,7 @@ class ResponseFormat(Enum):
     OPENAI_CHAT_COMPLETIONS = auto()
     OPENAI_COMPLETIONS = auto()
     TRITON = auto()
+    TRITON_GENERATE = auto()


 class Metrics:
@@ -418,6 +419,8 @@ def _get_profile_metadata(self, data: dict) -> None:
             self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS
         elif data["endpoint"] == "v1/completions":
             self._response_format = ResponseFormat.OPENAI_COMPLETIONS
+        elif "generate" in data["endpoint"]:
+            self._response_format = ResponseFormat.TRITON_GENERATE
         else:
             # TPA-66: add PA metadata to handle this case
             # When endpoint field is either empty or custom endpoint, fall
@@ -633,6 +636,8 @@ def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]:
             input_text = payload["messages"][0]["content"]
         elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS:
             input_text = payload["prompt"]
+        elif self._response_format == ResponseFormat.TRITON_GENERATE:
+            input_text = payload["text_input"]
         else:
             raise ValueError(
                 "Failed to parse OpenAI request input in profile export file."
@@ -660,7 +665,10 @@ def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]]
         """Tokenize the OpenAI response output texts."""
         output_texts = []
         for output in res_outputs:
-            text = self._extract_openai_text_output(output["response"])
+            if self._response_format == ResponseFormat.TRITON_GENERATE:
+                text = self._extract_generate_text_output(output["response"])
+            else:
+                text = self._extract_openai_text_output(output["response"])
             output_texts.append(text)
         return self._run_tokenizer(output_texts)
@@ -673,6 +681,16 @@ def _run_tokenizer(self, output_texts: List[str]) -> List[List[int]]:
         encodings = self._tokenizer(output_texts)
         return [out[1:] for out in encodings.data["input_ids"]]

+    def _extract_generate_text_output(self, response: str) -> str:
+        response = remove_sse_prefix(response)
+
+        if response == "":
+            return response
+
+        data = json.loads(response)
+        return data["text_output"]
+
     def _extract_openai_text_output(self, response: str) -> str:
         """Extracts text/content of the OpenAI response object."""
         response = remove_sse_prefix(response)
@@ -702,7 +720,10 @@ def _extract_openai_text_output(self, response: str) -> str:

     def _is_openai_empty_response(self, response: str) -> bool:
         """Returns true if the response is an openai response with no content (or empty content)"""
-        text = self._extract_openai_text_output(response)
+        if self._response_format == ResponseFormat.TRITON_GENERATE:
+            text = self._extract_generate_text_output(response)
+        else:
+            text = self._extract_openai_text_output(response)
         if text:
             return False
         return True

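Generate responses arrive as server-sent events whose data payload is a JSON object carrying a "text_output" field, so extraction is: strip the SSE prefix, skip empty events, parse the JSON, and read "text_output". A self-contained sketch of that logic (the local remove_sse_prefix below is a stand-in for genai-perf's helper and assumes a leading "data:" marker):

    import json

    def remove_sse_prefix(response: str) -> str:
        # Stand-in for genai-perf's helper of the same name.
        prefix = "data:"
        if response.startswith(prefix):
            return response[len(prefix):].strip()
        return response.strip()

    def extract_generate_text_output(response: str) -> str:
        response = remove_sse_prefix(response)
        if response == "":
            return response  # empty SSE event: nothing to tokenize
        return json.loads(response)["text_output"]

    # extract_generate_text_output('data: {"text_output": "Hello"}') returns "Hello"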
src/c++/perf_analyzer/genai-perf/genai_perf/parser.py

Lines changed: 5 additions & 3 deletions
@@ -46,7 +46,7 @@

 logger = logging.getLogger(__name__)

-_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions"}
+_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions", "generate": "v2/models/{MODEL_NAME}/generate"}


 def _check_model_args(
@@ -90,11 +90,13 @@ def _check_conditional_args(
         args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS
     elif args.endpoint_type == "completions":
         args.output_format = OutputFormat.OPENAI_COMPLETIONS
+    elif args.endpoint_type == "generate":
+        args.output_format = OutputFormat.TRITON_GENERATE

     if args.endpoint is not None:
         args.endpoint = args.endpoint.lstrip(" /")
     else:
-        args.endpoint = _endpoint_type_map[args.endpoint_type]
+        args.endpoint = _endpoint_type_map[args.endpoint_type].format(MODEL_NAME=args.model)
 elif args.endpoint_type is not None:
     parser.error(
         "The --endpoint-type option should only be used when using the 'openai' service-kind."
@@ -368,7 +370,7 @@ def _add_endpoint_args(parser):
     endpoint_group.add_argument(
         "--endpoint-type",
         type=str,
-        choices=["chat", "completions"],
+        choices=["chat", "completions", "generate"],
         required=False,
         help=f"The endpoint-type to send requests to on the "
         'server. This is only used with the "openai" service-kind.',

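Unlike the two OpenAI entries, the "generate" entry in _endpoint_type_map is a template: Triton's generate endpoint is per-model, so when no explicit --endpoint is given the model name is substituted into the path. A small sketch of that resolution step (the model name is an arbitrary example):

    _endpoint_type_map = {
        "chat": "v1/chat/completions",
        "completions": "v1/completions",
        "generate": "v2/models/{MODEL_NAME}/generate",
    }

    endpoint = _endpoint_type_map["generate"].format(MODEL_NAME="llama-2-7b")
    print(endpoint)  # v2/models/llama-2-7b/generate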