config.yaml
@@ -0,0 +1,42 @@
apply_library_patches: true
base_image:
  image: docker.io/baseten/triton_trt_llm:4062d46_20240401
  python_executable_path: /usr/bin/python3
build:
  arguments:
    engine_repository: baseten/llama-3-70b_fp8_tp2_i4096_o1024_bs30-tllm_0.9.0.dev2024022000
    pipeline_parallel_count: 1
    tensor_parallel_count: 2
    tokenizer_repository: baseten/Meta-Llama-3-tokenizer
bundled_packages_dir: packages
data_dir: data
description: Generate text from a prompt with this seventy billion parameter language model.
environment_variables: {}
examples_filename: examples.yaml
external_data: null
external_package_dirs: []
input_type: Any
live_reload: false
model_class_filename: model.py
model_class_name: Model
model_framework: custom
model_metadata:
  tags:
  - text-generation
  - openai-compatible
model_module_dir: model
model_name: Llama3 70B Instruct TRT-LLM
model_type: Model
python_version: py311
requirements:
- tritonclient[all]
- transformers
- jinja2
resources:
  accelerator: H100:2
  use_gpu: true
runtime:
  num_workers: 1
  predict_concurrency: 512
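
For orientation: `model.py` (below) parses the `build.arguments` block above into a `TrussBuildConfig` object from `schema.py`, which is not part of this diff. A minimal sketch of the shape that object would need, assuming Pydantic models, is:

```python
# Hypothetical sketch of the schema that model.py expects from config["build"]["arguments"].
# The real definitions live in schema.py, which is not shown in this diff.
from typing import Optional

from pydantic import BaseModel


class EngineBuildArgs(BaseModel):
    """Illustrative placeholder; build_engine_utils passes model_dump() straight to Engine(...)."""


class TrussBuildConfig(BaseModel):
    engine_repository: Optional[str] = None
    tokenizer_repository: str
    tensor_parallel_count: int = 1
    pipeline_parallel_count: int = 1
    engine_build_args: Optional[EngineBuildArgs] = None

    @property
    def requires_build(self) -> bool:
        # Assumption: an engine is only built when no prebuilt repository is supplied.
        return self.engine_repository is None
```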

model/model.py
@@ -0,0 +1,112 @@
import os
from itertools import count

import build_engine_utils
from constants import (
    GRPC_SERVICE_PORT,
    HF_AUTH_KEY_CONSTANT,
    HTTP_SERVICE_PORT,
    TOKENIZER_KEY_CONSTANT,
)
from schema import ModelInput, TrussBuildConfig
from transformers import AutoTokenizer
from triton_client import TritonClient, TritonServer

DEFAULT_MAX_TOKENS = 128
DEFAULT_MAX_NEW_TOKENS = 128


class Model:
    def __init__(self, data_dir, config, secrets):
        self._data_dir = data_dir
        self._config = config
        self._secrets = secrets
        self._request_id_counter = count(start=1)
        self.triton_client = None
        self.triton_server = None
        self.tokenizer = None
        self.uses_openai_api = None

    def load(self):
        build_config = TrussBuildConfig(**self._config["build"]["arguments"])
        self.uses_openai_api = "openai-compatible" in self._config.get(
            "model_metadata", {}
        ).get("tags", [])
        hf_access_token = None
        if "hf_access_token" in self._secrets._base_secrets.keys():
            hf_access_token = self._secrets["hf_access_token"]

        # TODO(Abu): Move to pre-runtime
        if build_config.requires_build:
            build_engine_utils.build_engine_from_config_args(
                engine_build_args=build_config.engine_build_args,
                dst=self._data_dir,
            )

        self.triton_server = TritonServer(
            grpc_port=GRPC_SERVICE_PORT,
            http_port=HTTP_SERVICE_PORT,
        )

        # Point Triton at either the prebuilt engine repository or the
        # engine that was just built into the data directory.
        self.triton_server.create_model_repository(
            truss_data_dir=self._data_dir,
            engine_repository_path=build_config.engine_repository
            if not build_config.requires_build
            else None,
            huggingface_auth_token=hf_access_token,
        )

        env = {}
        if hf_access_token:
            env[HF_AUTH_KEY_CONSTANT] = hf_access_token
        env[TOKENIZER_KEY_CONSTANT] = build_config.tokenizer_repository

        self.triton_server.start(
            world_size=build_config.tensor_parallel_count,
            env=env,
        )

        self.triton_client = TritonClient(
            grpc_service_port=GRPC_SERVICE_PORT,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(
            build_config.tokenizer_repository, token=hf_access_token
        )

        self.eos_token_id = self.tokenizer.eos_token_id

    async def predict(self, model_input):
        if "messages" not in model_input and "prompt" not in model_input:
            raise ValueError("Prompt or messages must be provided")

        model_input.setdefault("max_tokens", DEFAULT_MAX_TOKENS)
        model_input.setdefault("max_new_tokens", DEFAULT_MAX_NEW_TOKENS)
        # Unique request id per process and request for the Triton stream.
        model_input["request_id"] = str(os.getpid()) + str(
            next(self._request_id_counter)
        )
        model_input["eos_token_id"] = self.eos_token_id

        if "messages" in model_input:
            messages = model_input.pop("messages")
            # OpenAI-style chat requests are flattened into a single prompt
            # using the tokenizer's chat template.
            if self.uses_openai_api and "prompt" not in model_input:
                model_input["prompt"] = self.tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                )

        self.triton_client.start_grpc_stream()
        model_input = ModelInput(**model_input)
        result_iterator = self.triton_client.infer(model_input)

        async def generate():
            async for result in result_iterator:
                yield result

        if model_input.stream:
            return generate()

        # Non-streaming: drain the async iterator into a single string.
        # (str.join cannot consume an async generator directly.)
        full_text = "".join([text async for text in generate()])
        if self.uses_openai_api:
            return full_text
        return {"text": full_text}
README.md
@@ -0,0 +1,68 @@
# LLaMA3-70B-Instruct Truss

This is a [Truss](https://truss.baseten.co/) for an FP8 version of LLaMA3-70B-Instruct. Llama is a family of language models released by Meta. This README will walk you through how to deploy this Truss on Baseten to get your own instance of LLaMA3-70B-Instruct.

**Warning: This example is only intended for use on two H100 GPUs. Changing the resource type for this deployment will result in unsupported behavior.**

## Truss

Truss is an open-source model serving framework developed by Baseten. It allows you to develop and deploy machine learning models on Baseten. Using Truss, you can develop a GPU model with [live-reload](https://baseten.co/blog/technical-deep-dive-truss-live-reload), package models and their associated code, create Docker containers, and deploy on Baseten.

## Deploying LLaMA3-70B-Instruct

First, clone this repository:

```sh
git clone https://github.com/basetenlabs/truss-examples/
cd truss-examples/llama/llama-3-70b-instruct-trt-llm
```

Before deployment:

1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
2. Install the latest version of Truss: `pip install --upgrade truss`

With `llama-3-70b-instruct-trt-llm` as your working directory, you can deploy the model with:

```sh
truss push --publish
```

Paste your Baseten API key if prompted.

For more information, see [Truss documentation](https://truss.baseten.co).

## LLaMA3-70B API documentation

This section provides an overview of the LLaMA3-70B API, its parameters, and how to use it. The API consists of a single route named `predict`, which you can invoke to generate text based on the provided instruction.

### API route: `predict`

We expect requests with the following information:

- `prompt` (str): The prompt you'd like to complete
- `max_tokens` (int, default: 128): The maximum total token count. This includes the tokens in your prompt, so if this value is less than your prompt length, you'll just receive a truncated version of the prompt.
- `beam_width` (int, default: 1): The number of beams to compute. This must be 1 for this version of TRT-LLM; inflight batching does not support beams > 1.
- `bad_words_list` (list, default: []): A list of words to exclude from the generated output.
- `stop_words_list` (list, default: []): A list of words that stop generation when encountered.
- `repetition_penalty` (float, default: 1.0): A repetition penalty that discourages repeating tokens.

This Truss streams responses back as buffered chunks of text.
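
For example, a request that exercises several of these parameters looks like the following. The parameter names mirror the list above; treat the `stream` flag as an assumption of this example (the model code branches on `model_input.stream`, but the field is not documented above):

```sh
truss predict -d '{
  "prompt": "Write a haiku about GPUs",
  "max_tokens": 256,
  "beam_width": 1,
  "repetition_penalty": 1.05,
  "stream": true
}'
```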

## Example usage

```sh
truss predict -d '{"prompt": "What is the meaning of life?"}'
```

You can also invoke your model via a REST API:

```sh
curl -X POST "https://app.baseten.co/models/YOUR_MODEL_ID/predict" \
     -H "Content-Type: application/json" \
     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
     -d '{
           "prompt": "What is the meaning of life?"
         }'
```
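
If you are calling the REST endpoint from Python, streamed output can be consumed chunk by chunk. Here is a minimal sketch, assuming the same endpoint and auth pattern as the curl example above:

```python
# Minimal sketch: stream generated text from the deployed model over the REST API.
import requests

resp = requests.post(
    "https://app.baseten.co/models/YOUR_MODEL_ID/predict",
    headers={"Authorization": "Api-Key YOUR_API_KEY"},
    json={"prompt": "What is the meaning of life?", "stream": True, "max_tokens": 256},
    stream=True,
)
resp.raise_for_status()

# Responses arrive as buffered chunks of text; print them as they come in.
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)
```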
packages/build_engine_utils.py
@@ -0,0 +1,34 @@
from pathlib import Path

from schema import EngineBuildArgs


def build_engine_from_config_args(
    engine_build_args: EngineBuildArgs,
    dst: Path,
):
    import os
    import shutil
    import sys

    # NOTE: These are provided by the underlying base image
    # TODO(Abu): Remove this when we have a better way of handling this
    sys.path.append("/app/baseten")
    from build_engine import Engine, build_engine
    from trtllm_utils import docker_tag_aware_file_cache

    engine = Engine(**engine_build_args.model_dump())

    with docker_tag_aware_file_cache("/root/.cache/trtllm"):
        built_engine = build_engine(engine, download_remote=True)

        if not os.path.exists(dst):
            os.makedirs(dst)

        for filename in os.listdir(str(built_engine)):
            source_file = os.path.join(str(built_engine), filename)
            destination_file = os.path.join(dst, filename)
            if not os.path.exists(destination_file):
                shutil.copy(source_file, destination_file)

    return dst
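
For reference, `model.py` only calls this helper during `load()` when `requires_build` is set. A standalone invocation would look roughly like the sketch below; the `EngineBuildArgs` fields and the destination path are placeholders, not part of this Truss:

```python
# Hypothetical standalone use of build_engine_from_config_args, outside model.load().
from pathlib import Path

from build_engine_utils import build_engine_from_config_args
from schema import EngineBuildArgs

args = EngineBuildArgs()  # populate with whatever fields your engine build requires
engine_dir = build_engine_from_config_args(engine_build_args=args, dst=Path("/app/data"))
print(f"Engine artifacts staged in {engine_dir}")
```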
packages/constants.py
@@ -0,0 +1,9 @@
from pathlib import Path

# If changing the model repo path, please update it inside the tensorrt_llm config.pbtxt as well
TENSORRT_LLM_MODEL_REPOSITORY_PATH = Path("/packages/tensorrt_llm_model_repository/")
GRPC_SERVICE_PORT = 8001
HTTP_SERVICE_PORT = 8003
HF_AUTH_KEY_CONSTANT = "HUGGING_FACE_HUB_TOKEN"
TOKENIZER_KEY_CONSTANT = "TRITON_TOKENIZER_REPOSITORY"
ENTRYPOINT_MODEL_NAME = "ensemble"