config.yaml
@@ -0,0 +1,42 @@
apply_library_patches: true
base_image:
  image: docker.io/baseten/triton_trt_llm:4062d46_20240401
  python_executable_path: /usr/bin/python3
build:
  arguments:
    engine_repository: baseten/llama-3-70b_fp8_tp2_i4096_o1024_bs30-tllm_0.9.0.dev2024022000
    pipeline_parallel_count: 1
    tensor_parallel_count: 2
    tokenizer_repository: baseten/Meta-Llama-3-tokenizer
bundled_packages_dir: packages
data_dir: data
description: Generate text from a prompt with this seventy billion parameter language model.
environment_variables: {}
examples_filename: examples.yaml
external_data: null
external_package_dirs: []
input_type: Any
live_reload: false
model_class_filename: model.py
model_class_name: Model
model_framework: custom
model_metadata:
  tags:
  - text-generation
  - openai-compatible
model_module_dir: model
model_name: Llama3 70B Instruct TRT-LLM
model_type: Model
python_version: py311
requirements:
- tritonclient[all]
- transformers
- jinja2
resources:
  accelerator: H100:2
  use_gpu: true
runtime:
  num_workers: 1
  predict_concurrency: 512
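
For orientation: `model.py` (below) parses the `build.arguments` block above into a `TrussBuildConfig` object from `schema.py`, which is not part of this diff. A minimal sketch of the shape that object would need, assuming Pydantic models, is:

```python
# Hypothetical sketch of the schema that model.py expects from config["build"]["arguments"].
# The real definitions live in schema.py, which is not shown in this diff.
from typing import Optional

from pydantic import BaseModel


class EngineBuildArgs(BaseModel):
    """Illustrative placeholder; build_engine_utils passes model_dump() straight to Engine(...)."""


class TrussBuildConfig(BaseModel):
    engine_repository: Optional[str] = None
    tokenizer_repository: str
    tensor_parallel_count: int = 1
    pipeline_parallel_count: int = 1
    engine_build_args: Optional[EngineBuildArgs] = None

    @property
    def requires_build(self) -> bool:
        # Assumption: an engine is only built when no prebuilt repository is supplied.
        return self.engine_repository is None
```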

model/model.py
@@ -0,0 +1,112 @@
import os
from itertools import count

import build_engine_utils
from constants import (
    GRPC_SERVICE_PORT,
    HF_AUTH_KEY_CONSTANT,
    HTTP_SERVICE_PORT,
    TOKENIZER_KEY_CONSTANT,
)
from schema import ModelInput, TrussBuildConfig
from transformers import AutoTokenizer
from triton_client import TritonClient, TritonServer

DEFAULT_MAX_TOKENS = 128
DEFAULT_MAX_NEW_TOKENS = 128


class Model:
    def __init__(self, data_dir, config, secrets):
        self._data_dir = data_dir
        self._config = config
        self._secrets = secrets
        self._request_id_counter = count(start=1)
        self.triton_client = None
        self.triton_server = None
        self.tokenizer = None
        self.uses_openai_api = None

    def load(self):
        build_config = TrussBuildConfig(**self._config["build"]["arguments"])
        self.uses_openai_api = "openai-compatible" in self._config.get(
            "model_metadata", {}
        ).get("tags", [])
        hf_access_token = None
        if "hf_access_token" in self._secrets._base_secrets.keys():
            hf_access_token = self._secrets["hf_access_token"]

        # TODO(Abu): Move to pre-runtime
        if build_config.requires_build:
            build_engine_utils.build_engine_from_config_args(
                engine_build_args=build_config.engine_build_args,
                dst=self._data_dir,
            )

        self.triton_server = TritonServer(
            grpc_port=GRPC_SERVICE_PORT,
            http_port=HTTP_SERVICE_PORT,
        )

        # Point Triton at either the prebuilt engine repository or the
        # engine that was just built into the data directory.
        self.triton_server.create_model_repository(
            truss_data_dir=self._data_dir,
            engine_repository_path=build_config.engine_repository
            if not build_config.requires_build
            else None,
            huggingface_auth_token=hf_access_token,
        )

        env = {}
        if hf_access_token:
            env[HF_AUTH_KEY_CONSTANT] = hf_access_token
        env[TOKENIZER_KEY_CONSTANT] = build_config.tokenizer_repository

        self.triton_server.start(
            world_size=build_config.tensor_parallel_count,
            env=env,
        )

        self.triton_client = TritonClient(
            grpc_service_port=GRPC_SERVICE_PORT,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(
            build_config.tokenizer_repository, token=hf_access_token
        )

        self.eos_token_id = self.tokenizer.eos_token_id

    async def predict(self, model_input):
        if "messages" not in model_input and "prompt" not in model_input:
            raise ValueError("Prompt or messages must be provided")

        model_input.setdefault("max_tokens", DEFAULT_MAX_TOKENS)
        model_input.setdefault("max_new_tokens", DEFAULT_MAX_NEW_TOKENS)
        # Unique request id per process and request for the Triton stream.
        model_input["request_id"] = str(os.getpid()) + str(
            next(self._request_id_counter)
        )
        model_input["eos_token_id"] = self.eos_token_id

        if "messages" in model_input:
            messages = model_input.pop("messages")
            # OpenAI-style chat requests are flattened into a single prompt
            # using the tokenizer's chat template.
            if self.uses_openai_api and "prompt" not in model_input:
                model_input["prompt"] = self.tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                )

        self.triton_client.start_grpc_stream()
        model_input = ModelInput(**model_input)
        result_iterator = self.triton_client.infer(model_input)

        async def generate():
            async for result in result_iterator:
                yield result

        if model_input.stream:
            return generate()

        # Non-streaming: drain the async iterator into a single string.
        # (str.join cannot consume an async generator directly.)
        full_text = "".join([text async for text in generate()])
        if self.uses_openai_api:
            return full_text
        return {"text": full_text}
README.md
@@ -0,0 +1,68 @@
# LLaMA3-70B-Instruct Truss

This is a [Truss](https://truss.baseten.co/) for an FP8 version of LLaMA3-70B-Instruct. Llama is a family of language models released by Meta. This README will walk you through how to deploy this Truss on Baseten to get your own instance of LLaMA3-70B-Instruct.

**Warning: This example is only intended for use on two H100 GPUs. Changing the resource type for this deployment will result in unsupported behavior.**

## Truss

Truss is an open-source model serving framework developed by Baseten. It allows you to develop and deploy machine learning models on Baseten. Using Truss, you can develop a GPU model with [live-reload](https://baseten.co/blog/technical-deep-dive-truss-live-reload), package models and their associated code, create Docker containers, and deploy on Baseten.

## Deploying LLaMA3-70B-Instruct

First, clone this repository:

```sh
git clone https://github.com/basetenlabs/truss-examples/
cd truss-examples/llama/llama-3-70b-instruct-trt-llm
```

Before deployment:

1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
2. Install the latest version of Truss: `pip install --upgrade truss`

With `llama-3-70b-instruct-trt-llm` as your working directory, you can deploy the model with:

```sh
truss push --publish
```

Paste your Baseten API key if prompted.

For more information, see [Truss documentation](https://truss.baseten.co).

## LLaMA3-70B API documentation

This section provides an overview of the LLaMA3-70B API, its parameters, and how to use it. The API consists of a single route named `predict`, which you can invoke to generate text based on the provided instruction.

### API route: `predict`

We expect requests with the following information:

- `prompt` (str): The prompt you'd like to complete
- `max_tokens` (int, default: 128): The maximum total token count. This includes the tokens in your prompt, so if this value is less than your prompt length, you'll just receive a truncated version of the prompt.
- `beam_width` (int, default: 1): The number of beams to compute. This must be 1 for this version of TRT-LLM; inflight batching does not support beams > 1.
- `bad_words_list` (list, default: []): A list of words to exclude from the generated output.
- `stop_words_list` (list, default: []): A list of words that stop generation when encountered.
- `repetition_penalty` (float, default: 1.0): A repetition penalty that discourages repeating tokens.

This Truss streams responses back as buffered chunks of text.
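
For example, a request that exercises several of these parameters looks like the following. The parameter names mirror the list above; treat the `stream` flag as an assumption of this example (the model code branches on `model_input.stream`, but the field is not documented above):

```sh
truss predict -d '{
  "prompt": "Write a haiku about GPUs",
  "max_tokens": 256,
  "beam_width": 1,
  "repetition_penalty": 1.05,
  "stream": true
}'
```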

## Example usage

```sh
truss predict -d '{"prompt": "What is the meaning of life?"}'
```

You can also invoke your model via a REST API:

```sh
curl -X POST "https://app.baseten.co/models/YOUR_MODEL_ID/predict" \
     -H "Content-Type: application/json" \
     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
     -d '{
           "prompt": "What is the meaning of life?"
         }'
```
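
If you are calling the REST endpoint from Python, streamed output can be consumed chunk by chunk. Here is a minimal sketch, assuming the same endpoint and auth pattern as the curl example above:

```python
# Minimal sketch: stream generated text from the deployed model over the REST API.
import requests

resp = requests.post(
    "https://app.baseten.co/models/YOUR_MODEL_ID/predict",
    headers={"Authorization": "Api-Key YOUR_API_KEY"},
    json={"prompt": "What is the meaning of life?", "stream": True, "max_tokens": 256},
    stream=True,
)
resp.raise_for_status()

# Responses arrive as buffered chunks of text; print them as they come in.
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)
```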
packages/build_engine_utils.py
@@ -0,0 +1,34 @@
from pathlib import Path

from schema import EngineBuildArgs


def build_engine_from_config_args(
    engine_build_args: EngineBuildArgs,
    dst: Path,
):
    import os
    import shutil
    import sys

    # NOTE: These are provided by the underlying base image
    # TODO(Abu): Remove this when we have a better way of handling this
    sys.path.append("/app/baseten")
    from build_engine import Engine, build_engine
    from trtllm_utils import docker_tag_aware_file_cache

    engine = Engine(**engine_build_args.model_dump())

    with docker_tag_aware_file_cache("/root/.cache/trtllm"):
        built_engine = build_engine(engine, download_remote=True)

        if not os.path.exists(dst):
            os.makedirs(dst)

        for filename in os.listdir(str(built_engine)):
            source_file = os.path.join(str(built_engine), filename)
            destination_file = os.path.join(dst, filename)
            if not os.path.exists(destination_file):
                shutil.copy(source_file, destination_file)

    return dst
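
For reference, `model.py` only calls this helper during `load()` when `requires_build` is set. A standalone invocation would look roughly like the sketch below; the `EngineBuildArgs` fields and the destination path are placeholders, not part of this Truss:

```python
# Hypothetical standalone use of build_engine_from_config_args, outside model.load().
from pathlib import Path

from build_engine_utils import build_engine_from_config_args
from schema import EngineBuildArgs

args = EngineBuildArgs()  # populate with whatever fields your engine build requires
engine_dir = build_engine_from_config_args(engine_build_args=args, dst=Path("/app/data"))
print(f"Engine artifacts staged in {engine_dir}")
```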
packages/constants.py
@@ -0,0 +1,9 @@
from pathlib import Path

# If changing the model repo path, please update it inside the tensorrt_llm config.pbtxt as well
TENSORRT_LLM_MODEL_REPOSITORY_PATH = Path("/packages/tensorrt_llm_model_repository/")
GRPC_SERVICE_PORT = 8001
HTTP_SERVICE_PORT = 8003
HF_AUTH_KEY_CONSTANT = "HUGGING_FACE_HUB_TOKEN"
TOKENIZER_KEY_CONSTANT = "TRITON_TOKENIZER_REPOSITORY"
ENTRYPOINT_MODEL_NAME = "ensemble"