1 change: 1 addition & 0 deletions samples/pipeline_model_repository1/gpt2/1/model.json
@@ -0,0 +1 @@
{"model": "gpt2", "disable_log_requests": true, "gpu_memory_utilization": 0.85}
2 changes: 2 additions & 0 deletions samples/pipeline_model_repository1/gpt2/config.pbtxt
@@ -0,0 +1,2 @@
backend: "vllm"
instance_group [{kind: KIND_MODEL}]
82 changes: 82 additions & 0 deletions samples/pipeline_model_repository1/gpt2bls/1/model.py
@@ -0,0 +1,82 @@

import json

import numpy as np

# triton_python_backend_utils is available in every Triton Python model. You
# need to use this module to create inference requests and responses. It also
# contains some utility functions for extracting information from model_config
# and converting Triton input/output types to numpy types.
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        # `args["model_config"]` holds the model configuration as a JSON
        # string, so it must be parsed here.
        self.model_config = json.loads(args["model_config"])
        self.model_name = "gpt2"  # Name of the composing vLLM model to call

    def preprocess(self, request):
        # Pre-processing step: convert the input text to uppercase.
        text_input = pb_utils.get_input_tensor_by_name(
            request, "text_input"
        ).as_numpy()
        processed_text = text_input[0].upper()

        preprocessed_tensor = pb_utils.Tensor(
            "text_input",  # Name of the input tensor expected by gpt2
            np.array([processed_text]),
        )

        return preprocessed_tensor

    def execute(self, requests):
        responses = []

        for request in requests:
            preprocessed_tensor = self.preprocess(request)

            infer_request = pb_utils.InferenceRequest(
                model_name=self.model_name,
                requested_output_names=["text_output"],
                inputs=[preprocessed_tensor],
            )

            # The vLLM backend is decoupled, so exec() returns an iterator
            # of responses that ends with a final empty response.
            infer_responses = infer_request.exec(decoupled=True)

            response_text = ""

            for infer_response in infer_responses:
                # If an inference response has an error, raise an exception.
                if infer_response.has_error():
                    raise pb_utils.TritonModelException(
                        infer_response.error().message()
                    )

                # Skip the final empty response.
                if len(infer_response.output_tensors()) > 0:
                    response_text += (
                        pb_utils.get_output_tensor_by_name(
                            infer_response, "text_output"
                        )
                        .as_numpy()[0]
                        .decode("utf-8")  # Convert bytes to string
                    )

            # Post-processing step: add the model name as a prefix.
            output_tensor = self.postprocess(text_output=response_text)
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[output_tensor])
            )

        # Since this model uses the default (non-decoupled) mode, a single
        # response is returned per request.
        return responses

    def postprocess(self, text_output):
        # Post-processing step: add the model name as a prefix.
        output_text = f"{self.model_name}: {text_output}"
        return pb_utils.Tensor(
            "text_output", np.array([output_text.encode()])  # String to bytes
        )

    def finalize(self):
        print("Cleaning up...")
19 changes: 19 additions & 0 deletions samples/pipeline_model_repository1/gpt2bls/config.pbtxt
@@ -0,0 +1,19 @@
name: "gpt2bls"
backend: "python"

input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
  }
]
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ 1 ]
  }
]

instance_group [{ kind: KIND_CPU }]
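To exercise the pipeline end to end, a minimal client sketch follows. It assumes a Triton server loaded with this repository and listening on localhost:8001, and the `tritonclient` package; the tensor and model names come from the config above, everything else is illustrative:

```python
# Minimal client sketch (assumes tritonclient is installed and Triton is
# serving this model repository on localhost:8001).
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# "text_input" is a 1-element string tensor, as declared in config.pbtxt.
text = np.array(["hello world".encode("utf-8")], dtype=np.object_)
inp = grpcclient.InferInput("text_input", [1], "BYTES")
inp.set_data_from_numpy(text)

result = client.infer(
    model_name="gpt2bls",
    inputs=[inp],
    outputs=[grpcclient.InferRequestedOutput("text_output")],
)

# The BLS model prefixes the model name, e.g. "gpt2: <generated text>".
print(result.as_numpy("text_output")[0].decode("utf-8"))
```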