Commit a296189
initial generate support
1 parent 7c4fd6c

File tree: 5 files changed (+156, -13 lines)

src/c++/perf_analyzer/client_backend/openai/openai_client.cc

Lines changed: 23 additions & 6 deletions
@@ -63,6 +63,14 @@ namespace openai {
 void
 ChatCompletionRequest::SendResponse(bool is_final, bool is_null)
 {
+  // If the final response has already been sent
+  // (because [DONE] was detected), ignore the final
+  // response triggered by request completion.
+  if (final_response_sent_) {
+    return;
+  }
+
+  final_response_sent_ = is_final;
   response_callback_(new ChatCompletionResult(
       http_code_, std::move(response_buffer_), is_final, is_null, request_id_));
 }
@@ -107,13 +115,15 @@ ChatCompletionClient::ResponseHeaderHandler(
       hdr.find("text/event-stream") != std::string::npos) {
     request->is_stream_ = true;
   }
+
   return byte_size;
 }

 size_t
 ChatCompletionClient::ResponseHandler(
     void* contents, size_t size, size_t nmemb, void* userp)
 {
+
   // [TODO TMA-1666] verify if the SSE responses received are complete, or the
   // response need to be stitched first. To verify, print out the received
   // responses from SendResponse() to make sure the OpenAI server doesn't chunk
@@ -151,7 +161,7 @@ ChatCompletionClient::ResponseHandler(
   // RECV_END so that we always have the time of the last.
   request->timer_.CaptureTimestamp(
       triton::client::RequestTimers::Kind::RECV_END);
-
+
   return result_bytes;
 }
@@ -162,6 +172,8 @@ ChatCompletionClient::AsyncInfer(
     std::string& serialized_request_body, const std::string& request_id,
     const Headers& headers)
 {
+
+
   if (callback == nullptr) {
     return Error(
         "Callback function must be provided along with AsyncInfer() call.");
@@ -172,9 +184,14 @@ ChatCompletionClient::AsyncInfer(
     request->timer_.CaptureTimestamp(
         triton::client::RequestTimers::Kind::REQUEST_END);
     UpdateInferStat(request->timer_);
-    if (!request->is_stream_) {
-      request->SendResponse(true /* is_final */, false /* is_null */);
-    }
+
+    // Updated so this is OK to call multiple times;
+    // only the first final response will be sent.
+    //
+    // if (!request->is_stream_) {
+    //
+    request->SendResponse(true /* is_final */, false /* is_null */);
+    // }
   };
   std::unique_ptr<HttpRequest> request(new ChatCompletionRequest(
       std::move(completion_callback), std::move(callback), request_id,
@@ -185,7 +202,7 @@ ChatCompletionClient::AsyncInfer(
   request->AddInput(
       reinterpret_cast<uint8_t*>(serialized_request_body.data()),
       serialized_request_body.size());
-
+
   CURL* multi_easy_handle = curl_easy_init();
   Error err = PreRunProcessing(multi_easy_handle, raw_request, headers);
   if (!err.IsOk()) {
@@ -226,7 +243,7 @@ ChatCompletionClient::PreRunProcessing(

   // response data handled by ResponseHandler()
   curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ResponseHandler);
-  curl_easy_setopt(curl, CURLOPT_WRITEDATA, request);
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, request);

   const curl_off_t post_byte_size = request->total_input_byte_size_;
   curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE_LARGE, post_byte_size);

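The guard added above makes SendResponse() safe to call more than once: with streaming, a final response can be triggered once when the [DONE] event is detected and again when the request completes, and only the first of those should reach the callback. A minimal Python sketch of the same idempotent-final-response pattern (the class and names below are illustrative, not part of the C++ client):

    class StreamingRequest:
        """Forwards responses to a callback, sending the final response at most once."""

        def __init__(self, callback):
            self._callback = callback
            self._final_response_sent = False

        def send_response(self, payload, is_final):
            # A second "final" trigger (e.g. request completion after [DONE])
            # is silently ignored, mirroring final_response_sent_ above.
            if self._final_response_sent:
                return
            self._final_response_sent = is_final
            self._callback(payload, is_final)

As in the C++ change, the flag is assigned from is_final, so non-final streaming responses keep flowing until the first final one arrives.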
src/c++/perf_analyzer/client_backend/openai/openai_client.h

Lines changed: 2 additions & 1 deletion
@@ -127,6 +127,7 @@ class ChatCompletionRequest : public HttpRequest {
   // The timers for infer request.
   triton::client::RequestTimers timer_;
   const std::string request_id_;
+  bool final_response_sent_{false};
 };

 class ChatCompletionClient : public HttpClient {
@@ -172,7 +173,7 @@ class ChatCompletionClient : public HttpClient {
       void* contents, size_t size, size_t nmemb, void* userp);
   static size_t ResponseHeaderHandler(
       void* contents, size_t size, size_t nmemb, void* userp);
-
+
   Error UpdateInferStat(const triton::client::RequestTimers& timer);
   InferStat infer_stat_;
 };

src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py

Lines changed: 103 additions & 1 deletion
@@ -36,6 +36,7 @@ class PromptSource(Enum):
 class OutputFormat(Enum):
     OPENAI_CHAT_COMPLETIONS = auto()
     OPENAI_COMPLETIONS = auto()
+    TRITON_GENERATE = auto()
     TENSORRTLLM = auto()
     VLLM = auto()

@@ -356,7 +357,18 @@ def _convert_generic_json_to_output_format(
         output_tokens_deterministic: bool,
         model_name: str = "",
     ) -> Dict:
-        if output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS:
+        if output_format == OutputFormat.TRITON_GENERATE:
+            output_json = cls._convert_generic_json_to_generate_format(
+                generic_dataset,
+                add_model_name,
+                add_stream,
+                extra_inputs,
+                output_tokens_mean,
+                output_tokens_stddev,
+                output_tokens_deterministic,
+                model_name,
+            )
+        elif output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS:
             output_json = cls._convert_generic_json_to_openai_chat_completions_format(
                 generic_dataset,
                 add_model_name,
@@ -440,6 +452,43 @@ def _convert_generic_json_to_openai_chat_completions_format(

         return pa_json

+    @classmethod
+    def _convert_generic_json_to_generate_format(
+        cls,
+        dataset_json: Dict,
+        add_model_name: bool,
+        add_stream: bool,
+        extra_inputs: Dict,
+        output_tokens_mean: int,
+        output_tokens_stddev: int,
+        output_tokens_deterministic: bool,
+        model_name: str = "",
+    ) -> Dict:
+        (
+            system_role_headers,
+            user_role_headers,
+            text_input_headers,
+        ) = cls._determine_json_feature_roles(dataset_json)
+
+        pa_json = cls._populate_triton_generate_output_json(
+            dataset_json,
+            system_role_headers,
+            user_role_headers,
+            text_input_headers,
+            add_model_name,
+            add_stream,
+            extra_inputs,
+            output_tokens_mean,
+            output_tokens_stddev,
+            output_tokens_deterministic,
+            model_name,
+        )
+
+        return pa_json
+
     @classmethod
     def _convert_generic_json_to_openai_completions_format(
         cls,
@@ -617,6 +666,59 @@ def _populate_openai_chat_completions_output_json(
         )

         return pa_json
+
+    @classmethod
+    def _populate_triton_generate_output_json(
+        cls,
+        dataset: Dict,
+        system_role_headers: List[str],
+        user_role_headers: List[str],
+        text_input_headers: List[str],
+        add_model_name: bool,
+        add_stream: bool,
+        extra_inputs: Dict,
+        output_tokens_mean: int,
+        output_tokens_stddev: int,
+        output_tokens_deterministic: bool,
+        model_name: str = "",
+    ) -> Dict:
+        number_of_rows = len(dataset["rows"])
+        pa_json = cls._create_empty_trtllm_pa_json()
+
+        default_max_tokens = (
+            "max_tokens" not in extra_inputs
+            or output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN
+        )
+
+        pa_json = {"data": [{"payload": [{}]} for _ in dataset["rows"]]}
+
+        for index, entry in enumerate(dataset["rows"]):
+            for header, content in entry.items():
+                new_text_input = cls._create_new_text_input(
+                    header,
+                    system_role_headers,
+                    user_role_headers,
+                    text_input_headers,
+                    content,
+                )
+                pa_json["data"][index]["payload"][0]["text_input"] = new_text_input
+
+            pa_json = cls._add_optional_tags_to_openai_json(
+                pa_json,
+                index,
+                False,
+                add_stream,
+                extra_inputs,
+                output_tokens_mean,
+                output_tokens_stddev,
+                output_tokens_deterministic,
+                model_name,
+            )
+
+        return pa_json

     @classmethod
     def _populate_openai_completions_output_json(

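For Triton's generate endpoint, the converter above writes each prompt under the "text_input" key rather than OpenAI's "messages" or "prompt", one payload object per dataset row. Note that the committed helper first calls _create_empty_trtllm_pa_json() and then immediately overwrites pa_json with a dict literal, so that call (and number_of_rows / default_max_tokens) is effectively unused here. A simplified, self-contained sketch of the structure being built (an illustration only, not the genai-perf implementation; role-header handling is omitted and the optional-tag placement is assumed):

    from typing import Dict, List

    def build_generate_pa_json(prompts: List[str], stream: bool, extra_inputs: Dict) -> Dict:
        # One payload object per dataset row, as in _populate_triton_generate_output_json.
        pa_json = {"data": [{"payload": [{}]} for _ in prompts]}
        for index, prompt in enumerate(prompts):
            payload = pa_json["data"][index]["payload"][0]
            payload["text_input"] = prompt  # Triton generate prompt field
            if stream:
                payload["stream"] = True  # request SSE streaming
            payload.update(extra_inputs)  # e.g. {"max_tokens": 256}
        return pa_json

    # build_generate_pa_json(["What is Triton?"], True, {"max_tokens": 256}) returns
    # {"data": [{"payload": [{"text_input": "What is Triton?", "stream": True, "max_tokens": 256}]}]}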
src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py

Lines changed: 23 additions & 2 deletions
@@ -45,6 +45,7 @@ class ResponseFormat(Enum):
     OPENAI_CHAT_COMPLETIONS = auto()
     OPENAI_COMPLETIONS = auto()
     TRITON = auto()
+    TRITON_GENERATE = auto()


 class Metrics:
@@ -418,6 +419,8 @@ def _get_profile_metadata(self, data: dict) -> None:
             self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS
         elif data["endpoint"] == "v1/completions":
             self._response_format = ResponseFormat.OPENAI_COMPLETIONS
+        elif "generate" in data["endpoint"]:
+            self._response_format = ResponseFormat.TRITON_GENERATE
         else:
             # TPA-66: add PA metadata to handle this case
             # When endpoint field is either empty or custom endpoint, fall
@@ -633,6 +636,8 @@ def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]:
             input_text = payload["messages"][0]["content"]
         elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS:
             input_text = payload["prompt"]
+        elif self._response_format == ResponseFormat.TRITON_GENERATE:
+            input_text = payload["text_input"]
         else:
             raise ValueError(
                 "Failed to parse OpenAI request input in profile export file."
@@ -660,7 +665,10 @@ def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]]
         """Tokenize the OpenAI response output texts."""
         output_texts = []
         for output in res_outputs:
-            text = self._extract_openai_text_output(output["response"])
+            if self._response_format == ResponseFormat.TRITON_GENERATE:
+                text = self._extract_generate_text_output(output["response"])
+            else:
+                text = self._extract_openai_text_output(output["response"])
             output_texts.append(text)
         return self._run_tokenizer(output_texts)
@@ -673,6 +681,16 @@ def _run_tokenizer(self, output_texts: List[str]) -> List[List[int]]:
         encodings = self._tokenizer(output_texts)
         return [out[1:] for out in encodings.data["input_ids"]]

+    def _extract_generate_text_output(self, response: str) -> str:
+        response = remove_sse_prefix(response)
+
+        if response == "":
+            return response
+
+        data = json.loads(response)
+        return data["text_output"]
+
     def _extract_openai_text_output(self, response: str) -> str:
         """Extracts text/content of the OpenAI response object."""
         response = remove_sse_prefix(response)
@@ -702,7 +720,10 @@ def _extract_openai_text_output(self, response: str) -> str:

     def _is_openai_empty_response(self, response: str) -> bool:
         """Returns true if the response is an openai response with no content (or empty content)"""
-        text = self._extract_openai_text_output(response)
+        if self._response_format == ResponseFormat.TRITON_GENERATE:
+            text = self._extract_generate_text_output(response)
+        else:
+            text = self._extract_openai_text_output(response)
         if text:
             return False
         return True

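Generate responses arrive as server-sent events whose data payload is a JSON object carrying a "text_output" field, so extraction is: strip the SSE prefix, skip empty events, parse the JSON, and read "text_output". A self-contained sketch of that logic (the local remove_sse_prefix below is a stand-in for genai-perf's helper and assumes a leading "data:" marker):

    import json

    def remove_sse_prefix(response: str) -> str:
        # Stand-in for genai-perf's helper of the same name.
        prefix = "data:"
        if response.startswith(prefix):
            return response[len(prefix):].strip()
        return response.strip()

    def extract_generate_text_output(response: str) -> str:
        response = remove_sse_prefix(response)
        if response == "":
            return response  # empty SSE event: nothing to tokenize
        return json.loads(response)["text_output"]

    # extract_generate_text_output('data: {"text_output": "Hello"}') returns "Hello"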
src/c++/perf_analyzer/genai-perf/genai_perf/parser.py

Lines changed: 5 additions & 3 deletions
@@ -46,7 +46,7 @@

 logger = logging.getLogger(__name__)

-_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions"}
+_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions", "generate": "v2/models/{MODEL_NAME}/generate"}


 def _check_model_args(
@@ -90,11 +90,13 @@ def _check_conditional_args(
         args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS
     elif args.endpoint_type == "completions":
         args.output_format = OutputFormat.OPENAI_COMPLETIONS
+    elif args.endpoint_type == "generate":
+        args.output_format = OutputFormat.TRITON_GENERATE

     if args.endpoint is not None:
         args.endpoint = args.endpoint.lstrip(" /")
     else:
-        args.endpoint = _endpoint_type_map[args.endpoint_type]
+        args.endpoint = _endpoint_type_map[args.endpoint_type].format(MODEL_NAME=args.model)
 elif args.endpoint_type is not None:
     parser.error(
         "The --endpoint-type option should only be used when using the 'openai' service-kind."
@@ -368,7 +370,7 @@ def _add_endpoint_args(parser):
     endpoint_group.add_argument(
         "--endpoint-type",
         type=str,
-        choices=["chat", "completions"],
+        choices=["chat", "completions", "generate"],
         required=False,
         help=f"The endpoint-type to send requests to on the "
         'server. This is only used with the "openai" service-kind.',

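Unlike the two OpenAI entries, the "generate" entry in _endpoint_type_map is a template: Triton's generate endpoint is per-model, so when no explicit --endpoint is given the model name is substituted into the path. A small sketch of that resolution step (the model name is an arbitrary example):

    _endpoint_type_map = {
        "chat": "v1/chat/completions",
        "completions": "v1/completions",
        "generate": "v2/models/{MODEL_NAME}/generate",
    }

    endpoint = _endpoint_type_map["generate"].format(MODEL_NAME="llama-2-7b")
    print(endpoint)  # v2/models/llama-2-7b/generate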