diff --git a/llama/llama-3-70b-instruct-trt-llm/README.md b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/README.md
similarity index 100%
rename from llama/llama-3-70b-instruct-trt-llm/README.md
rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/README.md
diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/config.yaml b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/config.yaml
new file mode 100644
index 000000000..70ed2b7b8
--- /dev/null
+++ b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/config.yaml
@@ -0,0 +1,42 @@
+apply_library_patches: true
+base_image:
+  image: docker.io/baseten/triton_trt_llm:4062d46_20240401
+  python_executable_path: /usr/bin/python3
+build:
+  arguments:
+    engine_repository: baseten/llama-3-70b_fp8_tp2_i4096_o1024_bs30-tllm_0.9.0.dev2024022000
+    pipeline_parallel_count: 1
+    tensor_parallel_count: 2
+    tokenizer_repository: baseten/Meta-Llama-3-tokenizer
+bundled_packages_dir: packages
+data_dir: data
+description: Generate text from a prompt with this seventy billion parameter language
+  model.
+environment_variables: {}
+examples_filename: examples.yaml
+external_data: null
+external_package_dirs: []
+input_type: Any
+live_reload: false
+model_class_filename: model.py
+model_class_name: Model
+model_framework: custom
+model_metadata:
+  tags:
+  - text-generation
+  - openai-compatible
+model_module_dir: model
+model_name: Llama3 70B Instruct TRT-LLM
+model_type: Model
+python_version: py311
+requirements:
+- tritonclient[all]
+- transformers
+- jinja2
+resources:
+  accelerator: H100:2
+  use_gpu: true
+runtime:
+  num_workers: 1
+  predict_concurrency: 512
+
diff --git a/llama/llama-3-70b-instruct-trt-llm/model/__init__.py b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/model/__init__.py
similarity index 100%
rename from llama/llama-3-70b-instruct-trt-llm/model/__init__.py
rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/model/__init__.py
diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/model/model.py b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/model/model.py
new file mode 100644
index 000000000..ff4f19ed0
--- /dev/null
+++ b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/model/model.py
@@ -0,0 +1,112 @@
+import os
+from itertools import count
+
+import build_engine_utils
+from constants import (
+    GRPC_SERVICE_PORT,
+    HF_AUTH_KEY_CONSTANT,
+    HTTP_SERVICE_PORT,
+    TOKENIZER_KEY_CONSTANT,
+)
+from schema import ModelInput, TrussBuildConfig
+from transformers import AutoTokenizer
+from triton_client import TritonClient, TritonServer
+
+DEFAULT_MAX_TOKENS = 128
+DEFAULT_MAX_NEW_TOKENS = 128
+
+
+class Model:
+    def __init__(self, data_dir, config, secrets):
+        self._data_dir = data_dir
+        self._config = config
+        self._secrets = secrets
+        self._request_id_counter = count(start=1)
+        self.triton_client = None
+        self.triton_server = None
+        self.tokenizer = None
+        self.uses_openai_api = None
+
+    def load(self):
+        build_config = TrussBuildConfig(**self._config["build"]["arguments"])
+        self.uses_openai_api = "openai-compatible" in self._config.get(
+            "model_metadata", {}
+        ).get("tags", [])
+        hf_access_token = None
+        if "hf_access_token" in self._secrets._base_secrets.keys():
+            hf_access_token = self._secrets["hf_access_token"]
+
+        # TODO(Abu): Move to pre-runtime
+        if build_config.requires_build:
+            build_engine_utils.build_engine_from_config_args(
+                engine_build_args=build_config.engine_build_args,
+                dst=self._data_dir,
+            )
+
+        self.triton_server = TritonServer(
+            grpc_port=GRPC_SERVICE_PORT,
+            http_port=HTTP_SERVICE_PORT,
+        )
+
+        self.triton_server.create_model_repository(
+            truss_data_dir=self._data_dir,
+            engine_repository_path=build_config.engine_repository
+            if not build_config.requires_build
+            else None,
+            huggingface_auth_token=hf_access_token,
+        )
+
+        env = {}
+        if hf_access_token:
+            env[HF_AUTH_KEY_CONSTANT] = hf_access_token
+        env[TOKENIZER_KEY_CONSTANT] = build_config.tokenizer_repository
+
+        self.triton_server.start(
+            world_size=build_config.tensor_parallel_count,
+            env=env,
+        )
+
+        self.triton_client = TritonClient(
+            grpc_service_port=GRPC_SERVICE_PORT,
+        )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            build_config.tokenizer_repository, token=hf_access_token
+        )
+
+        self.eos_token_id = self.tokenizer.eos_token_id
+
+    async def predict(self, model_input):
+        if "messages" not in model_input and "prompt" not in model_input:
+            raise ValueError("Prompt or messages must be provided")
+
+        model_input.setdefault("max_tokens", DEFAULT_MAX_TOKENS)
+        model_input.setdefault("max_new_tokens", DEFAULT_MAX_NEW_TOKENS)
+        model_input["request_id"] = str(os.getpid()) + str(
+            next(self._request_id_counter)
+        )
+        model_input["eos_token_id"] = self.eos_token_id
+
+        if "messages" in model_input:
+            messages = model_input.pop("messages")
+            if self.uses_openai_api and "prompt" not in model_input:
+                model_input["prompt"] = self.tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                )
+
+        self.triton_client.start_grpc_stream()
+        model_input = ModelInput(**model_input)
+        result_iterator = self.triton_client.infer(model_input)
+
+        async def generate():
+            async for result in result_iterator:
+                yield result
+
+        if model_input.stream:
+            return generate()
+        else:
+            # "".join() cannot consume an async generator directly, so collect
+            # the streamed chunks with an async comprehension first.
+            full_text = "".join([chunk async for chunk in generate()])
+            if self.uses_openai_api:
+                return full_text
+            else:
+                return {"text": full_text}
diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/build_engine_utils.py b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/build_engine_utils.py
similarity index 100%
rename from llama/llama-3-70b-instruct-trt-llm/packages/build_engine_utils.py
rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/build_engine_utils.py
diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/constants.py b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/constants.py
similarity index 100%
rename from llama/llama-3-70b-instruct-trt-llm/packages/constants.py
rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/constants.py
diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/schema.py b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/schema.py
similarity index 100%
rename from llama/llama-3-70b-instruct-trt-llm/packages/schema.py
rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/schema.py
diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/ensemble/config.pbtxt b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/ensemble/config.pbtxt
similarity index 100%
rename from llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/ensemble/config.pbtxt
rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/ensemble/config.pbtxt
diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/postprocessing/1/model.py
b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/postprocessing/1/model.py similarity index 100% rename from llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/postprocessing/1/model.py rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/postprocessing/1/model.py diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/postprocessing/config.pbtxt b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/postprocessing/config.pbtxt similarity index 100% rename from llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/postprocessing/config.pbtxt rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/postprocessing/config.pbtxt diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/preprocessing/1/model.py b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/preprocessing/1/model.py similarity index 100% rename from llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/preprocessing/1/model.py rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/preprocessing/1/model.py diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/preprocessing/config.pbtxt b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/preprocessing/config.pbtxt similarity index 100% rename from llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/preprocessing/config.pbtxt rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/preprocessing/config.pbtxt diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/tensorrt_llm/config.pbtxt b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/tensorrt_llm/config.pbtxt similarity index 100% rename from llama/llama-3-70b-instruct-trt-llm/packages/tensorrt_llm_model_repository/tensorrt_llm/config.pbtxt rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/tensorrt_llm_model_repository/tensorrt_llm/config.pbtxt diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/triton_client.py b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/triton_client.py similarity index 100% rename from llama/llama-3-70b-instruct-trt-llm/packages/triton_client.py rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/triton_client.py diff --git a/llama/llama-3-70b-instruct-trt-llm/packages/utils.py b/llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/utils.py similarity index 100% rename from llama/llama-3-70b-instruct-trt-llm/packages/utils.py rename to llama/llama-3-70b-instruct-trt-llm/fp8_tp2_i4096_o1024_bs30/packages/utils.py diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/README.md b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/README.md new file mode 100644 index 000000000..0b7639042 --- /dev/null +++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/README.md @@ -0,0 +1,68 @@ +# LLaMA3-70B-Instruct Truss + +This is a [Truss](https://truss.baseten.co/) for an FP8 version of LLaMA3-70B-Instruct. 
Llama is a family of language models released by Meta. This README will walk you through how to deploy this Truss on Baseten to get your own instance of LLaMA3-70B-Instruct.
+
+**Warning: This example is only intended for use on 4 H100 GPUs; changing the resource type for this deployment will result in unsupported behavior.**
+
+## Truss
+
+Truss is an open-source model serving framework developed by Baseten. It allows you to develop and deploy machine learning models on Baseten. Using Truss, you can develop a GPU model using [live-reload](https://baseten.co/blog/technical-deep-dive-truss-live-reload), package models and their associated code, create Docker containers and deploy on Baseten.
+
+## Deploying LLaMA3-70B-Instruct
+
+First, clone this repository:
+
+```sh
+git clone https://github.com/basetenlabs/truss-examples/
+cd llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64
+```
+
+Before deployment:
+
+1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
+2. Install the latest version of Truss: `pip install --upgrade truss`
+
+With `fp8kv_tp4_i256_o128_bs64` as your working directory, you can deploy the model with:
+
+```sh
+truss push --publish
+```
+
+Paste your Baseten API key if prompted.
+
+For more information, see the [Truss documentation](https://truss.baseten.co).
+
+## LLaMA3-70B API documentation
+
+This section provides an overview of the LLaMA3-70B API, its parameters, and how to use it. The API consists of a single route named `predict`, which you can invoke to generate text based on the provided instruction.
+
+### API route: `predict`
+
+We expect requests with the following information:
+
+- `prompt` (str): The prompt you'd like to complete.
+- `max_tokens` (int, default: 50): The maximum token count. This includes the tokens in your prompt, so if this value is less than the length of your prompt, you'll just receive a truncated version of the prompt.
+- `beam_width` (int, default: 1): The number of beams to compute. This must be 1 for this version of TRT-LLM; in-flight batching does not support beam widths greater than 1.
+- `bad_words_list` (list, default: []): A list of words to exclude from the generated output.
+- `stop_words_list` (list, default: []): A list of words that stop generation when encountered.
+- `repetition_penalty` (float, default: 1.0): A repetition penalty to discourage repeating tokens.
+
+This Truss streams responses back as buffered chunks of text.
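+
+To consume the stream from Python, a sketch like the following works with plain `requests`; the model ID, API key, and `max_tokens` value are placeholders, not part of this Truss:
+
+```python
+import requests
+
+resp = requests.post(
+    "https://app.baseten.co/models/YOUR_MODEL_ID/predict",
+    headers={"Authorization": "Api-Key YOUR_API_KEY"},
+    json={"prompt": "What is the meaning of life?", "max_tokens": 256},
+    stream=True,
+)
+resp.raise_for_status()
+
+# Each chunk is a buffered piece of generated text; print it as it arrives.
+for chunk in resp.iter_content(chunk_size=None):
+    print(chunk.decode("utf-8"), end="", flush=True)
+```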
+
+## Example usage
+
+```sh
+truss predict -d '{"prompt": "What is the meaning of life?"}'
+```
+
+You can also invoke your model via a REST API:
+
+```sh
+curl -X POST "https://app.baseten.co/models/YOUR_MODEL_ID/predict" \
+     -H "Content-Type: application/json" \
+     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
+     -d '{
+       "prompt": "What is the meaning of life?"
+     }'
+```
diff --git a/llama/llama-3-70b-instruct-trt-llm/config.yaml b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/config.yaml
similarity index 100%
rename from llama/llama-3-70b-instruct-trt-llm/config.yaml
rename to llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/config.yaml
diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/model/__init__.py b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/model/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/llama/llama-3-70b-instruct-trt-llm/model/model.py b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/model/model.py
similarity index 100%
rename from llama/llama-3-70b-instruct-trt-llm/model/model.py
rename to llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/model/model.py
diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/build_engine_utils.py b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/build_engine_utils.py
new file mode 100644
index 000000000..6442e3508
--- /dev/null
+++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/build_engine_utils.py
@@ -0,0 +1,34 @@
+from pathlib import Path
+
+from schema import EngineBuildArgs
+
+
+def build_engine_from_config_args(
+    engine_build_args: EngineBuildArgs,
+    dst: Path,
+):
+    import os
+    import shutil
+    import sys
+
+    # NOTE: These are provided by the underlying base image
+    # TODO(Abu): Remove this when we have a better way of handling this
+    sys.path.append("/app/baseten")
+    from build_engine import Engine, build_engine
+    from trtllm_utils import docker_tag_aware_file_cache
+
+    engine = Engine(**engine_build_args.model_dump())
+
+    with docker_tag_aware_file_cache("/root/.cache/trtllm"):
+        built_engine = build_engine(engine, download_remote=True)
+
+    if not os.path.exists(dst):
+        os.makedirs(dst)
+
+    for filename in os.listdir(str(built_engine)):
+        source_file = os.path.join(str(built_engine), filename)
+        destination_file = os.path.join(dst, filename)
+        if not os.path.exists(destination_file):
+            shutil.copy(source_file, destination_file)
+
+    return dst
diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/constants.py b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/constants.py
new file mode 100644
index 000000000..1f19e8065
--- /dev/null
+++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/constants.py
@@ -0,0 +1,9 @@
+from pathlib import Path
+
+# If changing the model repo path, please update it inside the tensorrt_llm config.pbtxt as well
+TENSORRT_LLM_MODEL_REPOSITORY_PATH = Path("/packages/tensorrt_llm_model_repository/")
+GRPC_SERVICE_PORT = 8001
+HTTP_SERVICE_PORT = 8003
+HF_AUTH_KEY_CONSTANT = "HUGGING_FACE_HUB_TOKEN"
+TOKENIZER_KEY_CONSTANT = "TRITON_TOKENIZER_REPOSITORY"
+ENTRYPOINT_MODEL_NAME = "ensemble"
diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/schema.py b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/schema.py
new file mode 100644
index 000000000..ddc38f934
--- /dev/null
+++
b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/schema.py @@ -0,0 +1,230 @@ +from enum import Enum +from pathlib import Path +from typing import Optional + +import numpy as np +import tritonclient +import tritonclient.grpc.aio as grpcclient +from pydantic import BaseModel, ConfigDict, PrivateAttr + + +class ModelInput: + def __init__( + self, + prompt: str, + request_id: int, + max_tokens: int = 50, + max_new_tokens: int = 50, + temperature: float = 0.7, + top_p: float = 0.7, + top_k: int = 50, + beam_width: int = 1, + bad_words_list: Optional[list] = None, + stop_words_list: Optional[list] = None, + repetition_penalty: float = 1.0, + ignore_eos: bool = False, + stream: bool = True, + eos_token_id: int = None, # type: ignore + ) -> None: + self.stream = stream + self.request_id = request_id + self._prompt = prompt + self._max_tokens = max_tokens + self._max_new_tokens = max_new_tokens + self._temperature = temperature + self._top_p = top_p + self._top_k = top_k + self._beam_width = beam_width + self._bad_words_list = [""] if bad_words_list is None else bad_words_list + self._stop_words_list = [""] if stop_words_list is None else stop_words_list + self._repetition_penalty = repetition_penalty + self._eos_token_id = eos_token_id + self._ignore_eos = ignore_eos + + def _prepare_grpc_tensor( + self, name: str, input_data: np.ndarray + ) -> grpcclient.InferInput: + tensor = grpcclient.InferInput( + name, + input_data.shape, + tritonclient.utils.np_to_triton_dtype(input_data.dtype), + ) + tensor.set_data_from_numpy(input_data) + return tensor + + def to_tensors(self): + if self._eos_token_id is None and self._ignore_eos: + raise ValueError("eos_token_id is required when ignore_eos is True") + + prompt_data = np.array([[self._prompt]], dtype=object) + output_len_data = np.ones_like(prompt_data, dtype=np.uint32) * self._max_tokens + bad_words_data = np.array([self._bad_words_list], dtype=object) + stop_words_data = np.array([self._stop_words_list], dtype=object) + stream_data = np.array([[self.stream]], dtype=bool) + beam_width_data = np.array([[self._beam_width]], dtype=np.uint32) + repetition_penalty_data = np.array( + [[self._repetition_penalty]], dtype=np.float32 + ) + temperature_data = np.array([[self._temperature]], dtype=np.float32) + top_p_data = np.array([[self._top_p]], dtype=np.float32) + top_k_data = np.array([[self._top_k]], dtype=np.uint32) + + inputs = [ + self._prepare_grpc_tensor("text_input", prompt_data), + self._prepare_grpc_tensor("max_tokens", output_len_data), + self._prepare_grpc_tensor("bad_words", bad_words_data), + self._prepare_grpc_tensor("stop_words", stop_words_data), + self._prepare_grpc_tensor("stream", stream_data), + self._prepare_grpc_tensor("beam_width", beam_width_data), + self._prepare_grpc_tensor("repetition_penalty", repetition_penalty_data), + self._prepare_grpc_tensor("temperature", temperature_data), + self._prepare_grpc_tensor("top_p", top_p_data), + self._prepare_grpc_tensor("top_k", top_k_data), + ] + + if not self._ignore_eos: + end_id_data = np.array([[self._eos_token_id]], dtype=np.uint32) + inputs.append(self._prepare_grpc_tensor("end_id", end_id_data)) + + return inputs + + +class Quant(Enum): + NO_QUANT = "no_quant" + WEIGHTS_ONLY = "weights_only" + WEIGHTS_KV_INT8 = "weights_kv_int8" + SMOOTH_QUANT = "smooth_quant" + + +class EngineType(Enum): + LLAMA = "llama" + MISTRAL = "mistral" + + +class ArgsConfig(BaseModel): + max_input_len: Optional[int] = None + max_output_len: Optional[int] = None + max_batch_size: 
Optional[int] = None + tp_size: Optional[int] = None + pp_size: Optional[int] = None + world_size: Optional[int] = None + gather_all_token_logits: Optional[bool] = None + multi_block_mode: Optional[bool] = None + remove_input_padding: Optional[bool] = None + use_gpt_attention_plugin: Optional[str] = None + paged_kv_cache: Optional[bool] = None + use_inflight_batching: Optional[bool] = None + enable_context_fmha: Optional[bool] = None + use_gemm_plugin: Optional[str] = None + use_weight_only: Optional[bool] = None + output_dir: Optional[str] = None + model_dir: Optional[str] = None + ft_model_dir: Optional[str] = None + dtype: Optional[str] = None + int8_kv_cache: Optional[bool] = None + use_smooth_quant: Optional[bool] = None + per_token: Optional[bool] = None + per_channel: Optional[bool] = None + parallel_build: Optional[bool] = None + + # to disable warning because `model_dir` starts with `model_` prefix + model_config = ConfigDict(protected_namespaces=()) # type: ignore + + def as_command_arguments(self) -> list: + non_bool_args = [ + element + for arg, value in self.dict().items() + for element in [f"--{arg}", str(value)] + if value is not None and not isinstance(value, bool) + ] + bool_args = [ + f"--{arg}" + for arg, value in self.dict().items() + if isinstance(value, bool) and value + ] + return non_bool_args + bool_args + + +class CalibrationConfig(BaseModel): + kv_cache: Optional[bool] = None # either to calibrate kv cache + sq_alpha: Optional[float] = None + + def cache_path(self) -> Path: + if self.kv_cache is not None: + return Path("kv_cache") + else: + return Path(f"sq_{self.sq_alpha}") + + +class EngineBuildArgs(BaseModel, use_enum_values=True): + repo: Optional[str] = None + args: Optional[ArgsConfig] = None + quant: Optional[Quant] = None + calibration: Optional[CalibrationConfig] = None + engine_type: Optional[EngineType] = None + + +class TrussBuildConfig(BaseModel): + """ + This is a spec for what the config.yaml looks like to take advantage of TRT-LLM + TRT-LLM builds. We structure the + configuration with the below top-level keys. 
+ + Example (for building an engine) + ``` + build: + model_server: TRT_LLM + arguments: + tokenizer_repository: "mistralai/mistral-v2-instruct" + arguments: + max_input_len: 1024 + max_output_len: 1024 + max_batch_size: 64 + quant: "weights_kv_int8" + tensor_parallel_count: 2 + pipeline_parallel_count: 1 + ``` + + Example (for using an existing engine) + ``` + build: + model_server: TRT_LLM + arguments: + engine_repository: "baseten/mistral-v2-32k" + tensor_parallel_count: 2 + pipeline_parallel_count: 1 + ``` + + """ + + tokenizer_repository: str + quant: Quant = Quant.NO_QUANT + pipeline_parallel_count: int = 1 + tensor_parallel_count: int = 1 + arguments: Optional[ArgsConfig] = None + engine_repository: Optional[str] = None + calibration: Optional[CalibrationConfig] = None + engine_type: Optional[EngineType] = None + _engine_build_args: Optional[EngineBuildArgs] = PrivateAttr(default=None) + + @property + def engine_build_args(self) -> EngineBuildArgs: + if self._engine_build_args is None: + repo = self.tokenizer_repository + quant = self.quant + calibration = self.calibration + engine_type = self.engine_type + args = self.arguments or ArgsConfig() + args.tp_size = self.tensor_parallel_count + args.pp_size = self.pipeline_parallel_count + self._engine_build_args = EngineBuildArgs( + repo=repo, + quant=quant, + calibration=calibration, + engine_type=engine_type, + args=args, + ) + return self._engine_build_args + + @property + def requires_build(self): + return self.engine_repository is None diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/ensemble/config.pbtxt b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/ensemble/config.pbtxt new file mode 100644 index 000000000..618098de0 --- /dev/null +++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/ensemble/config.pbtxt @@ -0,0 +1,246 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
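+# The ensemble below chains the three Triton models into one pipeline:
+# preprocessing tokenizes text_input, tensorrt_llm generates output_ids, and
+# postprocessing decodes output_ids back into text_output.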
+ +name: "ensemble" +platform: "ensemble" +max_batch_size: 2048 +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "max_tokens" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "bad_words" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "stop_words" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "end_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "pad_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_k" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "length_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "min_length" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + optional: true + }, + { + name: "beam_width" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "stream" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ -1, -1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "preprocessing" + model_version: -1 + input_map { + key: "QUERY" + value: "text_input" + } + input_map { + key: "REQUEST_OUTPUT_LEN" + value: "max_tokens" + } + input_map { + key: "BAD_WORDS_DICT" + value: "bad_words" + } + input_map { + key: "STOP_WORDS_DICT" + value: "stop_words" + } + output_map { + key: "REQUEST_INPUT_LEN" + value: "_REQUEST_INPUT_LEN" + } + output_map { + key: "INPUT_ID" + value: "_INPUT_ID" + } + output_map { + key: "REQUEST_OUTPUT_LEN" + value: "_REQUEST_OUTPUT_LEN" + } + }, + { + model_name: "tensorrt_llm" + model_version: -1 + input_map { + key: "input_ids" + value: "_INPUT_ID" + } + input_map { + key: "input_lengths" + value: "_REQUEST_INPUT_LEN" + } + input_map { + key: "request_output_len" + value: "_REQUEST_OUTPUT_LEN" + } + input_map { + key: "end_id" + value: "end_id" + } + input_map { + key: "pad_id" + value: "pad_id" + } + input_map { + key: "runtime_top_k" + value: "top_k" + } + input_map { + key: "runtime_top_p" + value: "top_p" + } + input_map { + key: "temperature" + value: "temperature" + } + input_map { + key: "len_penalty" + value: "length_penalty" + } + input_map { + key: "repetition_penalty" + value: "repetition_penalty" + } + input_map { + key: "min_length" + value: "min_length" + } + input_map { + key: "presence_penalty" + value: "presence_penalty" + } + input_map { + key: "random_seed" + value: "random_seed" + } + input_map { + key: "beam_width" + value: "beam_width" + } + input_map { + key: "streaming" + value: "stream" + } + output_map { + key: "output_ids" + value: "_TOKENS_BATCH" + } + }, + { + model_name: "postprocessing" + model_version: -1 + input_map { + key: "TOKENS_BATCH" + value: "_TOKENS_BATCH" + } + output_map { + key: "OUTPUT" + value: "text_output" + } + } + ] +} diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/postprocessing/1/model.py b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/postprocessing/1/model.py new file mode 100644 index 
000000000..fb637654a --- /dev/null +++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/postprocessing/1/model.py @@ -0,0 +1,181 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import os +from collections import OrderedDict + +import numpy as np +import triton_python_backend_utils as pb_utils +from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + model_config = json.loads(args["model_config"]) + # NOTE: Keep this in sync with the truss model.py variable + tokenizer_dir = os.environ["TRITON_TOKENIZER_REPOSITORY"] + tokenizer_type = model_config["parameters"]["tokenizer_type"]["string_value"] + + if tokenizer_type == "t5": + self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, padding_side="left") + elif tokenizer_type == "auto": + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_dir, padding_side="left" + ) + elif tokenizer_type == "llama": + self.tokenizer = LlamaTokenizer.from_pretrained( + tokenizer_dir, legacy=False, padding_side="left" + ) + else: + raise AttributeError(f"Unexpected tokenizer type: {tokenizer_type}") + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Parse model output configs + output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") + # Convert Triton types to numpy types + self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + + self.state_dict = OrderedDict() + # TODO(pankaj) This should come from the batch size + self.cache_size = 2048 + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for idx, request in enumerate(requests): + # Get request ID + request_id = request.request_id() + + # Get input tensors + tokens_batch = ( + pb_utils.get_input_tensor_by_name(request, "TOKENS_BATCH") + .as_numpy() + .flatten() + ) + if len(tokens_batch) == 0: + continue + + # Postprocess output data + prev_token = self._get_prev_token(request_id) + self._store_prev_token(request_id, tokens_batch[-1]) + if prev_token is None: + delta = self.tokenizer.decode(tokens_batch) + else: + # TODO(pankaj) Figure out how to make tokenizer.decode not + # ignore initial whitespace so we can avoid this hack. + # Get string with and without previous token and diff. This hack + # is needed because tokenizer.decode strips initial whitespace. 
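+                # Example (illustrative values): if prev_token alone decodes to
+                # "Hello" and [prev_token] + tokens_batch decodes to "Hello world",
+                # the delta emitted for this chunk is " world".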
+ old_string = self.tokenizer.decode([prev_token]) + with_prev_token = np.concatenate(([prev_token], tokens_batch)) + new_string = self.tokenizer.decode(with_prev_token) + delta = self._compute_delta(old_string, new_string) + + # Create output tensor + output_tensor = pb_utils.Tensor( + "OUTPUT", np.array([delta]).astype(self.output_dtype) + ) + inference_response = pb_utils.InferenceResponse( + output_tensors=[output_tensor] + ) + responses.append(inference_response) + + return responses + + def finalize(self): + print("Cleaning up...") + + def _store_prev_token(self, request_id, token): + if request_id in self.state_dict: + self.state_dict[request_id]["prev_token"] = token + + # Move request ID to end of queue to prevent it from being evicted + self.state_dict.move_to_end(request_id) + else: + # Evict least recently used item if cache is full + if len(self.state_dict) > self.cache_size: + self.state_dict.popitem(last=False) + + self.state_dict[request_id] = {"prev_token": token} + + def _get_prev_token(self, request_id): + if request_id in self.state_dict: + return self.state_dict[request_id]["prev_token"] + return None + + def _compute_delta(self, prev_str, new_str): + delta = "".join( + [ + char + for index, char in enumerate(new_str) + if index >= len(prev_str) or char != prev_str[index] + ] + ) + return delta + + def _postprocessing(self, tokens): + decoded_tokens = self.tokenizer.decode(tokens) + return decoded_tokens diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/postprocessing/config.pbtxt b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/postprocessing/config.pbtxt new file mode 100644 index 000000000..854ef9606 --- /dev/null +++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/postprocessing/config.pbtxt @@ -0,0 +1,64 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
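+# This Python backend model decodes TOKENS_BATCH into OUTPUT text. At runtime the
+# tokenizer is loaded from the TRITON_TOKENIZER_REPOSITORY environment variable,
+# so the tokenizer_dir parameter below is effectively a placeholder.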
+ +name: "postprocessing" +backend: "python" +max_batch_size: 2048 +input [ + { + name: "TOKENS_BATCH" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_STRING + dims: [ -1, -1 ] + } +] + +parameters { + key: "tokenizer_dir" + value: { + string_value: "NousResearch/Llama-2-7b-hf" + } +} + +parameters { + key: "tokenizer_type" + value: { + string_value: "auto" + } +} + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/preprocessing/1/model.py b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/preprocessing/1/model.py new file mode 100644 index 000000000..fa4dcc2cd --- /dev/null +++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/preprocessing/1/model.py @@ -0,0 +1,260 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import csv +import json +import os +from typing import List + +import numpy as np +import triton_python_backend_utils as pb_utils +from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + model_config = json.loads(args["model_config"]) + # NOTE: Keep this in sync with the truss model.py variable + tokenizer_dir = os.environ["TRITON_TOKENIZER_REPOSITORY"] + tokenizer_type = model_config["parameters"]["tokenizer_type"]["string_value"] + self.add_special_tokens = model_config["parameters"].get( + "add_special_tokens", {"string_value": "false"} + )["string_value"].lower() in ["true", "1", "t", "y", "yes"] + + if tokenizer_type == "t5": + self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, padding_side="left") + elif tokenizer_type == "auto": + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_dir, padding_side="left" + ) + elif tokenizer_type == "llama": + self.tokenizer = LlamaTokenizer.from_pretrained( + tokenizer_dir, legacy=False, padding_side="left" + ) + else: + raise AttributeError(f"Unexpected tokenizer type: {tokenizer_type}") + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.pad_id = self.tokenizer.encode( + self.tokenizer.pad_token, add_special_tokens=False + )[0] + + # Parse model output configs and convert Triton types to numpy types + input_names = [ + "INPUT_ID", + "REQUEST_INPUT_LEN", + "BAD_WORDS_IDS", + "STOP_WORDS_IDS", + ] + for input_name in input_names: + setattr( + self, + input_name.lower() + "_dtype", + pb_utils.triton_string_to_numpy( + pb_utils.get_output_config_by_name(model_config, input_name)[ + "data_type" + ] + ), + ) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for idx, request in enumerate(requests): + # Get input tensors + query = pb_utils.get_input_tensor_by_name(request, "QUERY").as_numpy() + request_output_len = pb_utils.get_input_tensor_by_name( + request, "REQUEST_OUTPUT_LEN" + ).as_numpy() + + bad_words_dict = pb_utils.get_input_tensor_by_name( + request, "BAD_WORDS_DICT" + ).as_numpy() + stop_words_dict = pb_utils.get_input_tensor_by_name( + request, "STOP_WORDS_DICT" + ).as_numpy() + + # Preprocessing input data. + input_id, request_input_len = self._create_request(query) + bad_words = self._to_word_list_format(bad_words_dict) + stop_words = self._to_word_list_format(stop_words_dict) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. 
+ input_id_tensor = pb_utils.Tensor( + "INPUT_ID", np.array(input_id).astype(self.input_id_dtype) + ) + request_input_len_tensor = pb_utils.Tensor( + "REQUEST_INPUT_LEN", + np.array(request_input_len).astype(self.request_input_len_dtype), + ) + request_output_len_tensor = pb_utils.Tensor( + "REQUEST_OUTPUT_LEN", request_output_len + ) + bad_words_ids_tensor = pb_utils.Tensor("BAD_WORDS_IDS", bad_words) + stop_words_ids_tensor = pb_utils.Tensor("STOP_WORDS_IDS", stop_words) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, + bad_words_ids_tensor, + stop_words_ids_tensor, + request_input_len_tensor, + request_output_len_tensor, + ] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") + + def _create_request(self, query): + """ + query : batch string (2D numpy array) + """ + start_ids = [ + np.array( + self.tokenizer.encode( + s[0].decode(), add_special_tokens=self.add_special_tokens + ) + ).astype(int) + for s in query + ] + start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) + + max_len = 0 + for seq in start_ids: + max_len = max(max_len, seq.shape[0]) + start_ids = np.stack( + [ + np.pad( + seq, + (0, max_len - seq.shape[0]), + "constant", + constant_values=(0, self.pad_id), + ) + for seq in start_ids + ] + ) + + return start_ids, start_lengths + + def _to_word_list_format(self, word_dict: List[List[str]]): + """ + format of word_dict + len(word_dict) should be same to batch_size + word_dict[i] means the words for batch i + len(word_dict[i]) must be 1, which means it only contains 1 string + This string can contains several sentences and split by ",". + For example, if word_dict[2] = " I am happy, I am sad", then this function will return + the ids for two short sentences " I am happy" and " I am sad". 
+ """ + assert self.tokenizer is not None, "need to set tokenizer" + + flat_ids = [] + offsets = [] + for word_dict_item in word_dict: + item_flat_ids = [] + item_offsets = [] + + if isinstance(word_dict_item[0], bytes): + word_dict_item = [word_dict_item[0].decode()] + + words = list(csv.reader(word_dict_item))[0] + for word in words: + ids = self.tokenizer.encode(word) + + if len(ids) == 0: + continue + + item_flat_ids += ids + item_offsets.append(len(ids)) + + flat_ids.append(np.array(item_flat_ids)) + offsets.append(np.cumsum(np.array(item_offsets))) + + pad_to = max(1, max(len(ids) for ids in flat_ids)) + + for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): + flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) + offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) + + return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/preprocessing/config.pbtxt b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/preprocessing/config.pbtxt new file mode 100644 index 000000000..1fb880124 --- /dev/null +++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/preprocessing/config.pbtxt @@ -0,0 +1,99 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "preprocessing" +backend: "python" +max_batch_size: 2048 +input [ + { + name: "QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "BAD_WORDS_DICT" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "STOP_WORDS_DICT" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "REQUEST_OUTPUT_LEN" + data_type: TYPE_UINT32 + dims: [ -1 ] + } +] +output [ + { + name: "INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_INPUT_LEN" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "BAD_WORDS_IDS" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + }, + { + name: "STOP_WORDS_IDS" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + }, + { + name: "REQUEST_OUTPUT_LEN" + data_type: TYPE_UINT32 + dims: [ -1 ] + } +] + +parameters { + key: "tokenizer_dir" + value: { + string_value: "NousResearch/Llama-2-7b-hf" + } +} + +parameters { + key: "tokenizer_type" + value: { + string_value: "auto" + } +} + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/tensorrt_llm/config.pbtxt b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/tensorrt_llm/config.pbtxt new file mode 100644 index 000000000..75cb6718f --- /dev/null +++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/tensorrt_llm_model_repository/tensorrt_llm/config.pbtxt @@ -0,0 +1,208 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "tensorrt_llm" +backend: "tensorrtllm" +max_batch_size: 2048 + +model_transaction_policy { + decoupled: True +} + +input [ + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + }, + { + name: "request_output_len" + data_type: TYPE_UINT32 + dims: [ 1 ] + }, + { + name: "end_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "pad_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_width" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_k" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "len_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "min_length" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "stop" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "streaming" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "output_ids" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + } +] +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] +parameters: { + key: "max_beam_width" + value: { + string_value: "1" + } +} +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value: "no" + } +} +parameters: { + key: "gpt_model_type" + value: { + string_value: "inflight_fused_batching" + } +} +parameters: { + key: "gpt_model_path" + value: { + string_value: "/packages/tensorrt_llm_model_repository/tensorrt_llm/1" + } +} +parameters: { + key: "max_tokens_in_paged_kv_cache" + value: { + string_value: "100000" + } +} +parameters: { + key: "batch_scheduler_policy" + value: { + string_value: "max_utilization" + } +} +parameters: { + key: "kv_cache_free_gpu_mem_fraction" + value: { + string_value: "0.9" + } +} +parameters: { + key: "max_num_sequences" + value: { + string_value: "2048" + } +} +parameters: { + key: "enable_trt_overlap" + value: { + string_value: "False" + } +} diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/triton_client.py b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/triton_client.py new file mode 100644 index 000000000..b7cca7c0b --- /dev/null +++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/triton_client.py @@ -0,0 +1,136 @@ +import json +import os +import subprocess +import time +from pathlib import Path +from typing import AsyncGenerator, Optional + +import tritonclient.grpc.aio as grpcclient +import tritonclient.http as httpclient +from constants import ( + ENTRYPOINT_MODEL_NAME, + GRPC_SERVICE_PORT, + TENSORRT_LLM_MODEL_REPOSITORY_PATH, +) +from schema import ModelInput +from utils import download_engine, prepare_model_repository + + 
+class TritonServer: + def __init__(self, grpc_port: int = 8001, http_port: int = 8003): + self.grpc_port = grpc_port + self.http_port = http_port + self._server_process = None + + def create_model_repository( + self, + truss_data_dir: Path, + engine_repository_path: Optional[str] = None, + huggingface_auth_token: Optional[str] = None, + ) -> None: + if engine_repository_path: + download_engine( + engine_repository=engine_repository_path, + fp=truss_data_dir, + auth_token=huggingface_auth_token, + ) + prepare_model_repository(truss_data_dir) + return + + def start(self, world_size: int = 1, env: dict = {}) -> None: + mpirun_command = ["mpirun", "--allow-run-as-root"] + mpi_commands = [] + for i in range(world_size): + mpi_command = [ + "-n", + "1", + "tritonserver", + f"--model-repository={TENSORRT_LLM_MODEL_REPOSITORY_PATH}", + f"--grpc-port={str(self.grpc_port)}", + f"--http-port={str(self.http_port)}", + "--disable-auto-complete-config", + f"--backend-config=python,shm-region-prefix-name=prefix{i}_", + ":", + ] + + mpi_commands.extend(mpi_command) + command = mpirun_command + mpi_commands + + self._server_process = subprocess.Popen( # type: ignore + command, + env={**os.environ, **env}, + ) + while not self.is_alive and not self.is_ready: + time.sleep(2) + return + + def stop(self): + if self._server_process: + if self.is_server_ready: + self._server_process.kill() + self._server_process = None + return + + @property + def is_alive(self) -> bool: + try: + http_client = httpclient.InferenceServerClient( + url=f"localhost:{self.http_port}", verbose=False + ) + return http_client.is_server_live() + except ConnectionRefusedError: + return False + + @property + def is_ready(self) -> bool: + try: + http_client = httpclient.InferenceServerClient( + url=f"localhost:{self.http_port}", verbose=False + ) + return http_client.is_model_ready(model_name=ENTRYPOINT_MODEL_NAME) + except ConnectionRefusedError: + return False + + +class TritonClient: + def __init__(self, grpc_service_port: int = GRPC_SERVICE_PORT): + self.grpc_service_port = grpc_service_port + self._grpc_client = None + + def start_grpc_stream(self) -> grpcclient.InferenceServerClient: + if self._grpc_client: + return self._grpc_client + + self._grpc_client = grpcclient.InferenceServerClient( + url=f"localhost:{self.grpc_service_port}", verbose=False + ) + return self._grpc_client + + async def infer( + self, model_input: ModelInput, model_name="ensemble" + ) -> AsyncGenerator[str, None]: + grpc_client_instance = self.start_grpc_stream() + inputs = model_input.to_tensors() + + async def input_generator(): + yield { + "model_name": model_name, + "inputs": inputs, + "request_id": model_input.request_id, + } + + response_iterator = grpc_client_instance.stream_infer( + inputs_iterator=input_generator(), + ) + + try: + async for response in response_iterator: + result, error = response + if result: + result = result.as_numpy("text_output") + yield result[0].decode("utf-8") + else: + yield json.dumps({"status": "error", "message": error.message()}) + + except grpcclient.InferenceServerException as e: + print(f"InferenceServerException: {e}") diff --git a/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/utils.py b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/utils.py new file mode 100644 index 000000000..563f9119e --- /dev/null +++ b/llama/llama-3-70b-instruct-trt-llm/fp8kv_tp4_i256_o128_bs64/packages/utils.py @@ -0,0 +1,43 @@ +from pathlib import Path + +from constants import 
TENSORRT_LLM_MODEL_REPOSITORY_PATH +from huggingface_hub import snapshot_download + + +def move_all_files(src: Path, dest: Path) -> None: + """ + Moves all files from `src` to `dest` recursively. + """ + for item in src.iterdir(): + dest_item = dest / item.name + if item.is_dir(): + dest_item.mkdir(parents=True, exist_ok=True) + move_all_files(item, dest_item) + else: + item.rename(dest_item) + + +def prepare_model_repository(data_dir: Path) -> None: + # Ensure the destination directory exists + dest_dir = TENSORRT_LLM_MODEL_REPOSITORY_PATH / "tensorrt_llm" / "1" + dest_dir.mkdir(parents=True, exist_ok=True) + + # Ensure empty version directory for `ensemble` model exists + ensemble_dir = TENSORRT_LLM_MODEL_REPOSITORY_PATH / "ensemble" / "1" + ensemble_dir.mkdir(parents=True, exist_ok=True) + + # Move all files and directories from data_dir to dest_dir + move_all_files(data_dir, dest_dir) + + +def download_engine(engine_repository: str, fp: Path, auth_token=None): + """ + Downloads the specified engine from Hugging Face Hub. + """ + snapshot_download( + engine_repository, + local_dir=fp, + local_dir_use_symlinks=False, + max_workers=4, + **({"use_auth_token": auth_token} if auth_token is not None else {}), + )
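
A minimal sketch of how these helpers fit together at startup, mirroring `model.py` above; the data directory path and the absent auth token are illustrative assumptions:

```python
from pathlib import Path

from utils import download_engine, prepare_model_repository

data_dir = Path("/app/data")  # illustrative Truss data directory

# Pull the prebuilt engine from Hugging Face Hub into the data directory, then
# lay the files out into the Triton model repository expected by the server.
download_engine(
    engine_repository="baseten/llama-3-70b_fp8_tp2_i4096_o1024_bs30-tllm_0.9.0.dev2024022000",
    fp=data_dir,
    auth_token=None,  # pass an HF access token for gated or private repositories
)
prepare_model_repository(data_dir)
```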