diff --git a/samples/pipeline_model_repository1/gpt2/1/model.json b/samples/pipeline_model_repository1/gpt2/1/model.json
new file mode 100644
index 00000000..96f398c4
--- /dev/null
+++ b/samples/pipeline_model_repository1/gpt2/1/model.json
@@ -0,0 +1 @@
+{"model": "gpt2", "disable_log_requests": true, "gpu_memory_utilization": 0.85}
\ No newline at end of file
diff --git a/samples/pipeline_model_repository1/gpt2/config.pbtxt b/samples/pipeline_model_repository1/gpt2/config.pbtxt
new file mode 100644
index 00000000..291fdff0
--- /dev/null
+++ b/samples/pipeline_model_repository1/gpt2/config.pbtxt
@@ -0,0 +1,2 @@
+backend: "vllm"
+instance_group [{kind: KIND_MODEL}]
diff --git a/samples/pipeline_model_repository1/gpt2bls/1/model.py b/samples/pipeline_model_repository1/gpt2bls/1/model.py
new file mode 100644
index 00000000..a57c6159
--- /dev/null
+++ b/samples/pipeline_model_repository1/gpt2bls/1/model.py
@@ -0,0 +1,82 @@
+
+import json
+import numpy as np
+
+
+# triton_python_backend_utils is available in every Triton Python model. You
+# need to use this module to create inference requests and responses. It also
+# contains some utility functions for extracting information from model_config
+# and converting Triton input/output types to numpy types.
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def initialize(self, args):
+        # You must parse model_config. JSON string is not parsed here
+        self.model_config = json.loads(args["model_config"])
+        self.model_name = "gpt2"  # Name of the downstream vLLM model to call
+
+    # Converts the input text to uppercase
+    def preprocess(self, request):
+        text_input = pb_utils.get_input_tensor_by_name(request, "text_input").as_numpy()
+        # Pre-processing step (making the input all uppercase)
+        processed_text = text_input[0].upper()
+
+        preprocessed_tensor = pb_utils.Tensor(
+            "text_input",  # Name of the input tensor for gpt2
+            np.array([processed_text])
+        )
+
+        return preprocessed_tensor
+
+    def execute(self, requests):
+        response = []
+
+        for request in requests:
+
+            preprocessed_tensor = self.preprocess(request)
+
+            infer_request = pb_utils.InferenceRequest(
+                model_name=self.model_name,
+                requested_output_names=["text_output"],
+                inputs=[preprocessed_tensor]
+            )
+
+
+            infer_responses = infer_request.exec(decoupled=True)
+
+
+            response_text = ""
+
+            for infer_response in infer_responses:
+                # If the inference response has an error, raise an exception
+                if infer_response.has_error():
+                    raise pb_utils.TritonModelException(infer_response.error().message())
+
+                # Skip the final empty response sent by the decoupled model.
+                if len(infer_response.output_tensors()) > 0:
+                    response_text += pb_utils.get_output_tensor_by_name(
+                        infer_response, "text_output"
+                    ).as_numpy()[0].decode("utf-8")  # Convert bytes to string
+
+
+            # Post-processing step (adding the model name as a prefix)
+            output_tensor = self.postprocess(text_output=response_text)
+            response += [
+                pb_utils.InferenceResponse(
+                    output_tensors=[output_tensor]
+                )
+            ]
+
+        # Since the model is using the default mode in this example, we
+        # will be returning a single response.
+        return response
+
+    def postprocess(self, text_output):
+        # Post-processing step: adding the model name as a prefix
+        output_text = f"{self.model_name}: {text_output}"
+        return pb_utils.Tensor("text_output", np.array([output_text.encode()]))  # Convert string to bytes
+
+
+    def finalize(self):
+        print("Cleaning up...")
\ No newline at end of file
diff --git a/samples/pipeline_model_repository1/gpt2bls/config.pbtxt b/samples/pipeline_model_repository1/gpt2bls/config.pbtxt
new file mode 100644
index 00000000..8f5fda99
--- /dev/null
+++ b/samples/pipeline_model_repository1/gpt2bls/config.pbtxt
@@ -0,0 +1,19 @@
+name: "gpt2bls"
+backend: "python"
+
+input [
+{
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+}
+]
+output [
+{
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+}
+]
+
+instance_group [{ kind: KIND_CPU }]
\ No newline at end of file
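
For reference, a minimal client sketch for exercising the pipeline once Triton is serving this repository. This is not part of the sample itself; it assumes the server is listening on the default HTTP port 8000 and uses an arbitrary example prompt.

import numpy as np
import tritonclient.http as httpclient

# Assumes Triton was started with this model repository and is listening on
# the default HTTP port 8000.
client = httpclient.InferenceServerClient(url="localhost:8000")

# "text_input" is a 1-element TYPE_STRING tensor, per gpt2bls/config.pbtxt.
prompt = np.array(["tell me a short story"], dtype=object)
text_input = httpclient.InferInput("text_input", [1], "BYTES")
text_input.set_data_from_numpy(prompt)

# gpt2bls uppercases the prompt, calls the gpt2 vLLM model via BLS, and
# returns the generated text prefixed with the model name.
result = client.infer(model_name="gpt2bls", inputs=[text_input])
print(result.as_numpy("text_output")[0].decode("utf-8"))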