From c4b9ca631ad814908103cabb34ccb7c7056e7705 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Thu, 24 Jul 2025 15:24:53 -0400 Subject: [PATCH 01/18] Two vllm methods to deploy --- vllm/README.md | 7 +- .../README.md} | 0 vllm/openai_compatible/config.yaml | 36 +++ vllm/vllm_truss/README.md | 213 ++++++++++++++++++ vllm/{ => vllm_truss}/config.yaml | 0 vllm/vllm_truss/model/__init__.py | 0 vllm/{ => vllm_truss}/model/helper.py | 0 vllm/{ => vllm_truss}/model/model.py | 0 8 files changed, 255 insertions(+), 1 deletion(-) rename vllm/{model/__init__.py => openai_compatible/README.md} (100%) create mode 100644 vllm/openai_compatible/config.yaml create mode 100644 vllm/vllm_truss/README.md rename vllm/{ => vllm_truss}/config.yaml (100%) create mode 100644 vllm/vllm_truss/model/__init__.py rename vllm/{ => vllm_truss}/model/helper.py (100%) rename vllm/{ => vllm_truss}/model/model.py (100%) diff --git a/vllm/README.md b/vllm/README.md index 17fc3f903..3efdacc10 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -2,7 +2,12 @@ ## What is this Truss example doing -This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/latest/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). We create this example to give you the most codeless experience, so you can configure all vLLM engine parameters in `config.yaml`, without making code changes in `model.py` for most of the use cases. +This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/latest/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). + +## Two options + +### Vllm server via vllm serve +This is a completely codeless solution that 99% of users should use when deploying with vLLM. The only work required to set up this vLLM server is to modify the configs in vLLM serve. ## Configure your Truss by modifying the config.yaml diff --git a/vllm/model/__init__.py b/vllm/openai_compatible/README.md similarity index 100% rename from vllm/model/__init__.py rename to vllm/openai_compatible/README.md diff --git a/vllm/openai_compatible/config.yaml b/vllm/openai_compatible/config.yaml new file mode 100644 index 000000000..56dd7a754 --- /dev/null +++ b/vllm/openai_compatible/config.yaml @@ -0,0 +1,36 @@ +description: Llama 3.1 8B Instruct model is lightweight, multilingual and fine-tuned on human preferences for safety and helpfulness. +base_image: + image: vllm/vllm-openai:v0.9.2 +model_metadata: + repo_id: meta-llama/Llama-3.1-8B-Instruct + example_model_input: { + "model": "", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is the meaning of life?" 
+ } + ] + } + ] + } + tags: + - openai-compatible +docker_server: + start_command: sh -c "HF_TOKEN=$(cat /secrets/hf_access_token) vllm serve meta-llama/Llama-3.1-8B-Instruct --dtype half --max-model-len 65536 --port 8000 --served-model-name llama --tensor-parallel-size 1 --gpu-memory-utilization 0.95" + readiness_endpoint: /health + liveness_endpoint: /health + predict_endpoint: /v1/chat/completions + server_port: 8000 +resources: + accelerator: H100_40GB + use_gpu: true +runtime: + predict_concurrency : 16 +model_name: Llama 3.1 8B Instruct +secrets: + hf_access_token: null +requirements: [] diff --git a/vllm/vllm_truss/README.md b/vllm/vllm_truss/README.md new file mode 100644 index 000000000..17fc3f903 --- /dev/null +++ b/vllm/vllm_truss/README.md @@ -0,0 +1,213 @@ +# vLLM Truss to deploy chat completion model + +## What is this Truss example doing + +This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/latest/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). We create this example to give you the most codeless experience, so you can configure all vLLM engine parameters in `config.yaml`, without making code changes in `model.py` for most of the use cases. + +## Configure your Truss by modifying the config.yaml + +### Basic options using 1 GPU + +Here is the minimum config file you will need to deploy a model using vLLM on 1 GPU. +The only parameters you need to touch are: +- `model_name` +- `repo_id` +- `accelerator` + +``` +model_name: "Llama 3.1 8B Instruct VLLM" +python_version: py311 +model_metadata: + example_model_input: {"prompt": "what is the meaning of life"} + repo_id: meta-llama/Llama-3.1-8B-Instruct + openai_compatible: true + vllm_config: null +requirements: + - vllm==0.5.4 +resources: + accelerator: A100 + use_gpu: true +runtime: + predict_concurrency: 128 +secrets: + hf_access_token: null +``` + +### Basic options using multiple GPUs + +If your model needs more than 1 GPU to run using tensor parallel, you will need to change `accelerator`, and to set `tensor_parallel_size` and `distributed_executor_backend` accordingly. + +``` +model_name: "Llama 3.1 8B Instruct VLLM" +python_version: py311 +model_metadata: + example_model_input: {"prompt": "what is the meaning of life"} + repo_id: meta-llama/Llama-3.1-8B-Instruct + openai_compatible: false + vllm_config: + tensor_parallel_size: 4 + max_model_len: 4096 + distributed_executor_backend: mp +requirements: + - vllm==0.5.4 +resources: + accelerator: A10G:4 + use_gpu: true +runtime: + predict_concurrency: 128 +secrets: + hf_access_token: null +``` + +### Use vLLM's OpenAI compatible server + +To use vLLM in [OpenAI compatible server](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) mode, simply set `openai_compatible: true` under `model_metadata`. + +### Customize vLLM engine parameters + +For advanced users who want to override [vLLM engine arguments](https://docs.vllm.ai/en/latest/models/engine_args.html), you can add all arguments to `vllm_config` under `model_metadata`. 
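To make the pass-through concrete, here is a rough sketch of how a Truss `model.py` can hand `vllm_config` straight to the engine. The repository's actual `model.py` is not reproduced in this diff, so treat the class below as illustrative only: it assumes Truss's usual `Model(**kwargs)` / `load()` convention and the vLLM 0.5.x `AsyncEngineArgs` / `AsyncLLMEngine` APIs, and it omits the request-handling logic.

```python
# Illustrative sketch only -- not the repository's model.py.
# Assumes Truss's Model(**kwargs) / load() convention and vLLM 0.5.x APIs.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


class Model:
    def __init__(self, **kwargs):
        metadata = kwargs["config"]["model_metadata"]
        self._repo_id = metadata["repo_id"]
        # Whatever sits under `vllm_config` in config.yaml is forwarded
        # untouched, which is why most engine args need no code changes.
        self._vllm_config = metadata.get("vllm_config") or {}
        self._engine = None

    def load(self):
        # Build the async engine from the pass-through arguments.
        engine_args = AsyncEngineArgs(model=self._repo_id, **self._vllm_config)
        self._engine = AsyncLLMEngine.from_engine_args(engine_args)
```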
+ +#### Example 1: using model quantization + +``` +model_name: Mistral 7B v2 vLLM AWQ - T4 +environment_variables: {} +external_package_dirs: [] +model_metadata: + repo_id: TheBloke/Mistral-7B-Instruct-v0.2-AWQ + vllm_config: + quantization: "awq" + dtype: "float16" + max_model_len: 8000 + max_num_seqs: 8 +python_version: py310 +requirements: + - vllm==0.5.4 +resources: + accelerator: T4 + use_gpu: true +secrets: + hf_access_token: null +system_packages: [] +runtime: + predict_concurrency: 128 +``` + +#### Example 2: using customized vLLM image + +You can even override with your own customized vLLM docker image to work with models that are not supported yet by vanilla vLLM. + +``` +model_name: Ultravox v0.2 +base_image: + image: vshulman/vllm-openai-fixie:latest + python_executable_path: /usr/bin/python3 +model_metadata: + repo_id: fixie-ai/ultravox-v0.2 + vllm_config: + audio_token_id: 128002 +environment_variables: {} +external_package_dirs: [] +python_version: py310 +runtime: + predict_concurrency: 512 +requirements: + - httpx +resources: + accelerator: A100 + use_gpu: true +secrets: + hf_access_token: null +system_packages: +- python3.10-venv +``` + +## Deploy your Truss + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` +3. With `vllm` as your working directory, you can deploy the model with: + + ```sh + truss push --trusted + ``` + + Paste your Baseten API key if prompted. + +For more information, see [Truss documentation](https://truss.baseten.co). + +## Call your model + +Once your deployment is up, there are [many ways](https://docs.baseten.co/invoke/quickstart) to call your model. + +### curl command + +#### If you are NOT using OpenAI compatible server + +``` +curl -X POST https://model-.api.baseten.co/development/predict \ + -H "Authorization: Api-Key $BASETEN_API_KEY" \ + -d '{"prompt": "what is the meaning of life"}' +``` + + +#### If you are using OpenAI compatible server + +``` +curl -X POST "https://model-.api.baseten.co/development/predict" \ + -H "Content-Type: application/json" \ + -H 'Authorization: Api-Key {BASETEN_API_KEY}' \ + -d '{ + "messages": [{"role": "user", "content": "What even is AGI?"}], + "max_tokens": 256 + }' +``` + +To access [production metrics](https://docs.vllm.ai/en/latest/serving/metrics.html) reported by OpenAI compatible server, simply add `metrics: true` to the request. 
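The same calls can be made from Python with the `requests` package. The snippet below is a minimal sketch that mirrors the curl examples above; `abcd1234` is a placeholder model ID and a `BASETEN_API_KEY` environment variable is assumed.

```python
# Minimal sketch mirroring the curl examples above; "abcd1234" is a
# placeholder model ID, not a real deployment.
import os
import requests

model_id = "abcd1234"
url = f"https://model-{model_id}.api.baseten.co/development/predict"
headers = {"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"}

# OpenAI-compatible style body (messages + max_tokens).
resp = requests.post(
    url,
    headers=headers,
    json={
        "messages": [{"role": "user", "content": "What even is AGI?"}],
        "max_tokens": 256,
    },
)
print(resp.json())

# Adding "metrics": true instead returns the vLLM production metrics.
metrics = requests.post(url, headers=headers, json={"metrics": True})
print(metrics.text)
```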
+ +``` +curl -X POST "https://model-.api.baseten.co/development/predict" \ + -H "Content-Type: application/json" \ + -H 'Authorization: Api-Key {BASETEN_API_KEY}' \ + -d '{ + "metrics": true + }' +``` + +### OpenAI SDK (if you are using OpenAI compatible server) + +``` +from openai import OpenAI +import os + +model_id = "abcd1234" # Replace with your model ID +deployment_id = "4321cbda" # [Optional] Replace with your deployment ID + +client = OpenAI( + api_key=os.environ["BASETEN_API_KEY"], + base_url=f"https://bridge.baseten.co/{model_id}/v1/direct" +) + +response = client.chat.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "user", "content": "Who won the world series in 2020?"}, + {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."}, + {"role": "user", "content": "Where was it played?"} + ], + extra_body={ + "baseten": { + "model_id": model_id, + "deployment_id": deployment_id + } + } +) +print(response.choices[0].message.content) + +``` + +For more information, see [API reference](https://docs.baseten.co/api-reference/openai). + +## Support + +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/vllm/config.yaml b/vllm/vllm_truss/config.yaml similarity index 100% rename from vllm/config.yaml rename to vllm/vllm_truss/config.yaml diff --git a/vllm/vllm_truss/model/__init__.py b/vllm/vllm_truss/model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/vllm/model/helper.py b/vllm/vllm_truss/model/helper.py similarity index 100% rename from vllm/model/helper.py rename to vllm/vllm_truss/model/helper.py diff --git a/vllm/model/model.py b/vllm/vllm_truss/model/model.py similarity index 100% rename from vllm/model/model.py rename to vllm/vllm_truss/model/model.py From 11e56ad07ceca2486c99f019a0d5f9f33ae475cc Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Thu, 24 Jul 2025 17:46:18 -0400 Subject: [PATCH 02/18] Document the best method of using vllm serve --- vllm/README.md | 211 +----------------- vllm/openai_compatible/README.md | 0 vllm/{vllm_truss => truss_server}/README.md | 0 vllm/{vllm_truss => truss_server}/config.yaml | 0 .../model/__init__.py | 0 .../model/helper.py | 0 .../model/model.py | 0 vllm/vllm_server/README.md | 28 +++ .../config.yaml | 0 9 files changed, 31 insertions(+), 208 deletions(-) delete mode 100644 vllm/openai_compatible/README.md rename vllm/{vllm_truss => truss_server}/README.md (100%) rename vllm/{vllm_truss => truss_server}/config.yaml (100%) rename vllm/{vllm_truss => truss_server}/model/__init__.py (100%) rename vllm/{vllm_truss => truss_server}/model/helper.py (100%) rename vllm/{vllm_truss => truss_server}/model/model.py (100%) create mode 100644 vllm/vllm_server/README.md rename vllm/{openai_compatible => vllm_server}/config.yaml (100%) diff --git a/vllm/README.md b/vllm/README.md index 3efdacc10..f1b3b1830 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -7,212 +7,7 @@ This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an ## Two options ### Vllm server via vllm serve -This is a completely codeless solution that 99% of users should use when deploying with vLLM. The only work required to set up this vLLM server is to modify the configs in vLLM serve. +This is an openai-compatible codeless solution that the large majority of users should use when deploying with vLLM. 
The only work required to set up this vLLM server is to modify the configs in the vllm serve command in `config.yaml`. See the -## Configure your Truss by modifying the config.yaml - -### Basic options using 1 GPU - -Here is the minimum config file you will need to deploy a model using vLLM on 1 GPU. -The only parameters you need to touch are: -- `model_name` -- `repo_id` -- `accelerator` - -``` -model_name: "Llama 3.1 8B Instruct VLLM" -python_version: py311 -model_metadata: - example_model_input: {"prompt": "what is the meaning of life"} - repo_id: meta-llama/Llama-3.1-8B-Instruct - openai_compatible: true - vllm_config: null -requirements: - - vllm==0.5.4 -resources: - accelerator: A100 - use_gpu: true -runtime: - predict_concurrency: 128 -secrets: - hf_access_token: null -``` - -### Basic options using multiple GPUs - -If your model needs more than 1 GPU to run using tensor parallel, you will need to change `accelerator`, and to set `tensor_parallel_size` and `distributed_executor_backend` accordingly. - -``` -model_name: "Llama 3.1 8B Instruct VLLM" -python_version: py311 -model_metadata: - example_model_input: {"prompt": "what is the meaning of life"} - repo_id: meta-llama/Llama-3.1-8B-Instruct - openai_compatible: false - vllm_config: - tensor_parallel_size: 4 - max_model_len: 4096 - distributed_executor_backend: mp -requirements: - - vllm==0.5.4 -resources: - accelerator: A10G:4 - use_gpu: true -runtime: - predict_concurrency: 128 -secrets: - hf_access_token: null -``` - -### Use vLLM's OpenAI compatible server - -To use vLLM in [OpenAI compatible server](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) mode, simply set `openai_compatible: true` under `model_metadata`. - -### Customize vLLM engine parameters - -For advanced users who want to override [vLLM engine arguments](https://docs.vllm.ai/en/latest/models/engine_args.html), you can add all arguments to `vllm_config` under `model_metadata`. - -#### Example 1: using model quantization - -``` -model_name: Mistral 7B v2 vLLM AWQ - T4 -environment_variables: {} -external_package_dirs: [] -model_metadata: - repo_id: TheBloke/Mistral-7B-Instruct-v0.2-AWQ - vllm_config: - quantization: "awq" - dtype: "float16" - max_model_len: 8000 - max_num_seqs: 8 -python_version: py310 -requirements: - - vllm==0.5.4 -resources: - accelerator: T4 - use_gpu: true -secrets: - hf_access_token: null -system_packages: [] -runtime: - predict_concurrency: 128 -``` - -#### Example 2: using customized vLLM image - -You can even override with your own customized vLLM docker image to work with models that are not supported yet by vanilla vLLM. - -``` -model_name: Ultravox v0.2 -base_image: - image: vshulman/vllm-openai-fixie:latest - python_executable_path: /usr/bin/python3 -model_metadata: - repo_id: fixie-ai/ultravox-v0.2 - vllm_config: - audio_token_id: 128002 -environment_variables: {} -external_package_dirs: [] -python_version: py310 -runtime: - predict_concurrency: 512 -requirements: - - httpx -resources: - accelerator: A100 - use_gpu: true -secrets: - hf_access_token: null -system_packages: -- python3.10-venv -``` - -## Deploy your Truss - -1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). -2. Install the latest version of Truss: `pip install --upgrade truss` -3. With `vllm` as your working directory, you can deploy the model with: - - ```sh - truss push --trusted - ``` - - Paste your Baseten API key if prompted. 
- -For more information, see [Truss documentation](https://truss.baseten.co). - -## Call your model - -Once your deployment is up, there are [many ways](https://docs.baseten.co/invoke/quickstart) to call your model. - -### curl command - -#### If you are NOT using OpenAI compatible server - -``` -curl -X POST https://model-.api.baseten.co/development/predict \ - -H "Authorization: Api-Key $BASETEN_API_KEY" \ - -d '{"prompt": "what is the meaning of life"}' -``` - - -#### If you are using OpenAI compatible server - -``` -curl -X POST "https://model-.api.baseten.co/development/predict" \ - -H "Content-Type: application/json" \ - -H 'Authorization: Api-Key {BASETEN_API_KEY}' \ - -d '{ - "messages": [{"role": "user", "content": "What even is AGI?"}], - "max_tokens": 256 - }' -``` - -To access [production metrics](https://docs.vllm.ai/en/latest/serving/metrics.html) reported by OpenAI compatible server, simply add `metrics: true` to the request. - -``` -curl -X POST "https://model-.api.baseten.co/development/predict" \ - -H "Content-Type: application/json" \ - -H 'Authorization: Api-Key {BASETEN_API_KEY}' \ - -d '{ - "metrics": true - }' -``` - -### OpenAI SDK (if you are using OpenAI compatible server) - -``` -from openai import OpenAI -import os - -model_id = "abcd1234" # Replace with your model ID -deployment_id = "4321cbda" # [Optional] Replace with your deployment ID - -client = OpenAI( - api_key=os.environ["BASETEN_API_KEY"], - base_url=f"https://bridge.baseten.co/{model_id}/v1/direct" -) - -response = client.chat.completions.create( - model="meta-llama/Llama-3.1-8B-Instruct", - messages=[ - {"role": "user", "content": "Who won the world series in 2020?"}, - {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."}, - {"role": "user", "content": "Where was it played?"} - ], - extra_body={ - "baseten": { - "model_id": model_id, - "deployment_id": deployment_id - } - } -) -print(response.choices[0].message.content) - -``` - -For more information, see [API reference](https://docs.baseten.co/api-reference/openai). - -## Support - -If you have any questions or need assistance, please open an issue in this repository or contact our support team. +### Vllm using truss server +This solution is for a more custom deployment where you require flexibility such as custom logic in your predictions. 
\ No newline at end of file diff --git a/vllm/openai_compatible/README.md b/vllm/openai_compatible/README.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/vllm/vllm_truss/README.md b/vllm/truss_server/README.md similarity index 100% rename from vllm/vllm_truss/README.md rename to vllm/truss_server/README.md diff --git a/vllm/vllm_truss/config.yaml b/vllm/truss_server/config.yaml similarity index 100% rename from vllm/vllm_truss/config.yaml rename to vllm/truss_server/config.yaml diff --git a/vllm/vllm_truss/model/__init__.py b/vllm/truss_server/model/__init__.py similarity index 100% rename from vllm/vllm_truss/model/__init__.py rename to vllm/truss_server/model/__init__.py diff --git a/vllm/vllm_truss/model/helper.py b/vllm/truss_server/model/helper.py similarity index 100% rename from vllm/vllm_truss/model/helper.py rename to vllm/truss_server/model/helper.py diff --git a/vllm/vllm_truss/model/model.py b/vllm/truss_server/model/model.py similarity index 100% rename from vllm/vllm_truss/model/model.py rename to vllm/truss_server/model/model.py diff --git a/vllm/vllm_server/README.md b/vllm/vllm_server/README.md new file mode 100644 index 000000000..f9263d54c --- /dev/null +++ b/vllm/vllm_server/README.md @@ -0,0 +1,28 @@ +# vLLM Truss to deploy chat completion model + +## What is this Truss example doing + +This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/latest/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). + +## Configure your Truss by modifying the config.yaml + +### Basic options using 1 GPU + +To deploy a model using 1 GPU, the only config parameters you need to change are: +- `model_name` +- `repo_id` +- `accelerator` + +### Basic options using multiple GPUs + +If your model needs more than 1 GPU to run using tensor parallel, you will need to change `accelerator`, and to set `tensor_parallel_size` and `distributed_executor_backend` accordingly. + +`tensor_parallel_size` and `distributed_executor_backend` are each arguments for the vllm serve command in the `config.yaml`. + +If you are using 4 GPUs for inference for example, you need to add these arguments to the `vllm serve` command. + +`--tensor-parallel-size 4 --distributed-executor-backend mp` + +### Other ways to customize + +See this [doc](https://docs.vllm.ai/en/v0.7.2/serving/engine_args.html) for all the ways you can customize the `vllm serve` command. These parameters give you control over the level of compilation, quantization, and much more. 
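For orientation, the flags in this example's `start_command` (see the accompanying `config.yaml`) already cover the most common knobs. The snippet below is purely illustrative: it restates that command as a commented Python structure so each flag's role is visible at a glance; the real `start_command` additionally reads the Hugging Face token from `/secrets/hf_access_token`.

```python
# Illustrative only: restates the `vllm serve` flags from this example's
# config.yaml, with a note on what each one controls.
serve_flags = {
    "--dtype": "half",                  # weight precision
    "--max-model-len": "65536",         # maximum context length
    "--port": "8000",                   # must match server_port in config.yaml
    "--served-model-name": "llama",     # name clients pass as "model"
    "--tensor-parallel-size": "1",      # number of GPUs to shard across
    "--gpu-memory-utilization": "0.95", # fraction of GPU memory vLLM may use
}

command = "vllm serve meta-llama/Llama-3.1-8B-Instruct " + " ".join(
    f"{flag} {value}" for flag, value in serve_flags.items()
)
print(command)
```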
\ No newline at end of file diff --git a/vllm/openai_compatible/config.yaml b/vllm/vllm_server/config.yaml similarity index 100% rename from vllm/openai_compatible/config.yaml rename to vllm/vllm_server/config.yaml From bdd6b2bc88be86bdfeb7b13577cb983686bf4038 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 15:33:13 -0400 Subject: [PATCH 03/18] Remove openAI compatible references for it is not entirely openAI compatible, and also deprecated --- vllm/truss_server/README.md | 8 +++----- vllm/truss_server/config.yaml | 1 - 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/truss_server/README.md b/vllm/truss_server/README.md index 17fc3f903..803435131 100644 --- a/vllm/truss_server/README.md +++ b/vllm/truss_server/README.md @@ -2,7 +2,9 @@ ## What is this Truss example doing -This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/latest/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). We create this example to give you the most codeless experience, so you can configure all vLLM engine parameters in `config.yaml`, without making code changes in `model.py` for most of the use cases. +Deploying vLLM using a truss server is only recommended for flexibility and custom inference logic. Otherwise, check out deploying using [vllm server](https://github.com/basetenlabs/truss-examples/tree/main/vllm/vllm_server), which is also OpenAI compatible. + +This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/latest/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). ## Configure your Truss by modifying the config.yaml @@ -59,10 +61,6 @@ secrets: hf_access_token: null ``` -### Use vLLM's OpenAI compatible server - -To use vLLM in [OpenAI compatible server](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) mode, simply set `openai_compatible: true` under `model_metadata`. - ### Customize vLLM engine parameters For advanced users who want to override [vLLM engine arguments](https://docs.vllm.ai/en/latest/models/engine_args.html), you can add all arguments to `vllm_config` under `model_metadata`. 
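Because every key under `vllm_config` is forwarded to the engine, a quick local sanity check can catch typos before a deploy. The helper below is a small illustrative sketch, not part of this repository; it assumes vLLM 0.5.x, where `AsyncEngineArgs` is a dataclass, and reuses the values from the quantization example above.

```python
# Illustrative helper, not part of this repo: verify that the keys under
# `vllm_config` are real vLLM engine arguments (vLLM 0.5.x assumed).
from dataclasses import fields
from vllm.engine.arg_utils import AsyncEngineArgs

# Values taken from the quantization example in this README.
vllm_config = {
    "quantization": "awq",
    "dtype": "float16",
    "max_model_len": 8000,
    "max_num_seqs": 8,
}

valid_args = {f.name for f in fields(AsyncEngineArgs)}
unknown = sorted(set(vllm_config) - valid_args)
if unknown:
    raise ValueError(f"Unknown vLLM engine arguments: {unknown}")
print("vllm_config keys look valid")
```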
diff --git a/vllm/truss_server/config.yaml b/vllm/truss_server/config.yaml index bd98b7e40..cd24c7f97 100644 --- a/vllm/truss_server/config.yaml +++ b/vllm/truss_server/config.yaml @@ -3,7 +3,6 @@ python_version: py311 model_metadata: example_model_input: {"prompt": "what is the meaning of life"} repo_id: meta-llama/Llama-3.1-8B-Instruct - openai_compatible: true vllm_config: tensor_parallel_size: 1 max_model_len: 4096 From 1bae519a4c216ae141e4c618c79e9d8551974cf2 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 15:36:27 -0400 Subject: [PATCH 04/18] Add back openai compatible tag --- vllm/truss_server/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/truss_server/config.yaml b/vllm/truss_server/config.yaml index cd24c7f97..bd98b7e40 100644 --- a/vllm/truss_server/config.yaml +++ b/vllm/truss_server/config.yaml @@ -3,6 +3,7 @@ python_version: py311 model_metadata: example_model_input: {"prompt": "what is the meaning of life"} repo_id: meta-llama/Llama-3.1-8B-Instruct + openai_compatible: true vllm_config: tensor_parallel_size: 1 max_model_len: 4096 From aec86c946ca9c45f43b80f2c6683123b3fae77ea Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 15:38:44 -0400 Subject: [PATCH 05/18] Stress the recommended option more and fix typo --- vllm/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/README.md b/vllm/README.md index f1b3b1830..031713ec1 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -6,8 +6,8 @@ This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an ## Two options -### Vllm server via vllm serve -This is an openai-compatible codeless solution that the large majority of users should use when deploying with vLLM. The only work required to set up this vLLM server is to modify the configs in the vllm serve command in `config.yaml`. See the +### Vllm server via vllm serve (Recommended) +This is an openai-compatible codeless solution that the large majority of users should use when deploying with vLLM. The only work required to set up this vLLM server is to modify the configs in the vllm serve command in `config.yaml`. ### Vllm using truss server This solution is for a more custom deployment where you require flexibility such as custom logic in your predictions. \ No newline at end of file From 8be8691c730453b00af621f397a9f072e49ad8aa Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 15:40:16 -0400 Subject: [PATCH 06/18] Explanation of truss server openai compatibility --- vllm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/README.md b/vllm/README.md index 031713ec1..b049c86fd 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -10,4 +10,4 @@ This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an This is an openai-compatible codeless solution that the large majority of users should use when deploying with vLLM. The only work required to set up this vLLM server is to modify the configs in the vllm serve command in `config.yaml`. ### Vllm using truss server -This solution is for a more custom deployment where you require flexibility such as custom logic in your predictions. \ No newline at end of file +This solution is for a more custom deployment where you require flexibility such as custom logic in your predictions. This is OpenAI compatible with a few modifications in your calling code. 
\ No newline at end of file From eb6a7766e548fe742842d232bb2b7937af6e64a0 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 15:44:12 -0400 Subject: [PATCH 07/18] Further explaining multi-gpu inference --- vllm/vllm_server/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/vllm_server/README.md b/vllm/vllm_server/README.md index f9263d54c..e4ae890bb 100644 --- a/vllm/vllm_server/README.md +++ b/vllm/vllm_server/README.md @@ -19,10 +19,8 @@ If your model needs more than 1 GPU to run using tensor parallel, you will need `tensor_parallel_size` and `distributed_executor_backend` are each arguments for the vllm serve command in the `config.yaml`. -If you are using 4 GPUs for inference for example, you need to add these arguments to the `vllm serve` command. +If you are using 4 GPUs for inference for example, you need to add the arguments `--tensor-parallel-size 4 --distributed-executor-backend mp` to the `vllm serve` command as well as indicating this new quantity under `accelerator: H100:4`. -`--tensor-parallel-size 4 --distributed-executor-backend mp` - -### Other ways to customize +### Customize the vLLM server See this [doc](https://docs.vllm.ai/en/v0.7.2/serving/engine_args.html) for all the ways you can customize the `vllm serve` command. These parameters give you control over the level of compilation, quantization, and much more. \ No newline at end of file From 2aab9b1fe6bfcbc82462bf08a067e5bfd1647d43 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 15:46:32 -0400 Subject: [PATCH 08/18] Better explanation of vllm server truss --- vllm/vllm_server/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/vllm_server/README.md b/vllm/vllm_server/README.md index e4ae890bb..6da81acf9 100644 --- a/vllm/vllm_server/README.md +++ b/vllm/vllm_server/README.md @@ -23,4 +23,6 @@ If you are using 4 GPUs for inference for example, you need to add the arguments ### Customize the vLLM server +This container starts by calling the `vllm serve` command under `start_command` in `config.yaml`. + See this [doc](https://docs.vllm.ai/en/v0.7.2/serving/engine_args.html) for all the ways you can customize the `vllm serve` command. These parameters give you control over the level of compilation, quantization, and much more. \ No newline at end of file From 12ee0f31c787590003a8f3cc3454bfe56b150d40 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 15:50:31 -0400 Subject: [PATCH 09/18] Update documentation links --- vllm/README.md | 8 ++------ vllm/truss_server/README.md | 2 +- vllm/vllm_server/README.md | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/README.md b/vllm/README.md index b049c86fd..c010f3a3c 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -1,13 +1,9 @@ # vLLM Truss to deploy chat completion model -## What is this Truss example doing - -This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/latest/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). - ## Two options ### Vllm server via vllm serve (Recommended) -This is an openai-compatible codeless solution that the large majority of users should use when deploying with vLLM. The only work required to set up this vLLM server is to modify the configs in the vllm serve command in `config.yaml`. 
+This is an openai-compatible codeless solution that the large majority of users should use when deploying with vLLM. The only work required to set up this vLLM server is to modify the configs in the vllm serve command in `config.yaml`. See `vllm_server` directory for the truss. ### Vllm using truss server -This solution is for a more custom deployment where you require flexibility such as custom logic in your predictions. This is OpenAI compatible with a few modifications in your calling code. \ No newline at end of file +This solution is for a custom deployment where you require flexibility such as custom inference logic. This is OpenAI compatible with a few modifications in your calling code. See `truss_server` directory for the truss. \ No newline at end of file diff --git a/vllm/truss_server/README.md b/vllm/truss_server/README.md index 803435131..beb59b6fa 100644 --- a/vllm/truss_server/README.md +++ b/vllm/truss_server/README.md @@ -4,7 +4,7 @@ Deploying vLLM using a truss server is only recommended for flexibility and custom inference logic. Otherwise, check out deploying using [vllm server](https://github.com/basetenlabs/truss-examples/tree/main/vllm/vllm_server), which is also OpenAI compatible. -This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/latest/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). +This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/v0.6.5/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). ## Configure your Truss by modifying the config.yaml diff --git a/vllm/vllm_server/README.md b/vllm/vllm_server/README.md index 6da81acf9..6a7aaac02 100644 --- a/vllm/vllm_server/README.md +++ b/vllm/vllm_server/README.md @@ -2,7 +2,7 @@ ## What is this Truss example doing -This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/latest/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). +This is a codeless, easy OpenAI compatible solution to run a vllm server in a truss. Run a vllm server simply by modifying configurations, we'll handle the rest. ## Configure your Truss by modifying the config.yaml From 43132d95b112433b54564d565ea29387a78bfd71 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 17:13:46 -0400 Subject: [PATCH 10/18] Improve styling and presentation of markup --- vllm/README.md | 36 ++++++++-- vllm/truss_server/README.md | 136 ++++++++++++++++++++---------------- vllm/vllm_server/README.md | 36 ++++++---- 3 files changed, 128 insertions(+), 80 deletions(-) diff --git a/vllm/README.md b/vllm/README.md index c010f3a3c..57a930059 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -1,9 +1,33 @@ -# vLLM Truss to deploy chat completion model +# Deploying a Chat Completion Model with vLLM Truss -## Two options +This repository provides two approaches for deploying OpenAI-compatible chat completion models using vLLM and Truss. Select the option that best suits your use case. 
-### Vllm server via vllm serve (Recommended) -This is an openai-compatible codeless solution that the large majority of users should use when deploying with vLLM. The only work required to set up this vLLM server is to modify the configs in the vllm serve command in `config.yaml`. See `vllm_server` directory for the truss. +--- -### Vllm using truss server -This solution is for a custom deployment where you require flexibility such as custom inference logic. This is OpenAI compatible with a few modifications in your calling code. See `truss_server` directory for the truss. \ No newline at end of file +## Deployment Options + +### 1. **vLLM Server via `vllm serve` (Strongly Recommended)** + +**Overview:** +Leverage the built-in vLLM server for an OpenAI-compatible, codeless deployment. This is the recommended method for most users who want a fast, production-ready setup. + +**How to Use:**= +- See the [`vllm_server`](./vllm_server) directory for more details and instructions. + +**Why use this?** +- Minimal setup, codeless solution +- OpenAI-compatible + +--- + +### 2. **vLLM with Truss Server** + +**Overview:** +For advanced users who need custom inference logic, additional pre/post-processing, or further flexibility. + +**How to Use:** +- Refer to the [`truss_server`](./truss_server) directory for details and configuration examples. + +**Why use this?** +- Fully customizable inference and server logic. +- OpenAI-compatible with minimal client changes. diff --git a/vllm/truss_server/README.md b/vllm/truss_server/README.md index beb59b6fa..20c8b8f6c 100644 --- a/vllm/truss_server/README.md +++ b/vllm/truss_server/README.md @@ -1,22 +1,28 @@ -# vLLM Truss to deploy chat completion model +# vLLM Truss: Deploy Chat Completion Models -## What is this Truss example doing +## Overview -Deploying vLLM using a truss server is only recommended for flexibility and custom inference logic. Otherwise, check out deploying using [vllm server](https://github.com/basetenlabs/truss-examples/tree/main/vllm/vllm_server), which is also OpenAI compatible. +This repository demonstrates how to deploy [vLLM](https://github.com/vllm-project/vllm) using a Truss server. +**Use this approach only if you need custom inference logic or flexibility.** +For most users, we recommend the easier [vLLM server example](https://github.com/basetenlabs/truss-examples/tree/main/vllm/vllm_server), which is also OpenAI-compatible. -This is a general purpose [Truss](https://truss.baseten.co/) that can deploy an asynchronous vLLM engine([AsyncLLMEngine](https://docs.vllm.ai/en/v0.6.5/dev/engine/async_llm_engine.html#asyncllmengine)) of any customized configuration with [all compatible chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). +This Truss works with asynchronous vLLM engines ([AsyncLLMEngine](https://docs.vllm.ai/en/v0.6.5/dev/engine/async_llm_engine.html#asyncllmengine)) and [all supported chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). -## Configure your Truss by modifying the config.yaml +--- -### Basic options using 1 GPU +## Configure Your Truss (`config.yaml`) -Here is the minimum config file you will need to deploy a model using vLLM on 1 GPU. -The only parameters you need to touch are: +### Single GPU Example + +To deploy on a single GPU, update these fields: - `model_name` - `repo_id` - `accelerator` -``` +
+Minimal config example + +```yaml model_name: "Llama 3.1 8B Instruct VLLM" python_version: py311 model_metadata: @@ -34,12 +40,21 @@ runtime: secrets: hf_access_token: null ``` +
-### Basic options using multiple GPUs +--- -If your model needs more than 1 GPU to run using tensor parallel, you will need to change `accelerator`, and to set `tensor_parallel_size` and `distributed_executor_backend` accordingly. +### Multi-GPU Example (Tensor Parallelism) -``` +For multi-GPU deployments, set: +- `accelerator` (e.g., `A10G:4`) +- `model_metadata.vllm_config.tensor_parallel_size` +- `model_metadata.vllm_config.distributed_executor_backend` + +
+Multi-GPU config example + +```yaml model_name: "Llama 3.1 8B Instruct VLLM" python_version: py311 model_metadata: @@ -60,17 +75,18 @@ runtime: secrets: hf_access_token: null ``` +
-### Customize vLLM engine parameters +--- -For advanced users who want to override [vLLM engine arguments](https://docs.vllm.ai/en/latest/models/engine_args.html), you can add all arguments to `vllm_config` under `model_metadata`. +### Custom vLLM Engine Parameters -#### Example 1: using model quantization +Override any [vLLM engine argument](https://docs.vllm.ai/en/latest/models/engine_args.html) by adding it to `vllm_config` in `model_metadata`. -``` +#### Example: Model Quantization + +```yaml model_name: Mistral 7B v2 vLLM AWQ - T4 -environment_variables: {} -external_package_dirs: [] model_metadata: repo_id: TheBloke/Mistral-7B-Instruct-v0.2-AWQ vllm_config: @@ -84,18 +100,15 @@ requirements: resources: accelerator: T4 use_gpu: true -secrets: - hf_access_token: null -system_packages: [] runtime: predict_concurrency: 128 +secrets: + hf_access_token: null ``` -#### Example 2: using customized vLLM image +#### Example: Custom Docker Image -You can even override with your own customized vLLM docker image to work with models that are not supported yet by vanilla vLLM. - -``` +```yaml model_name: Ultravox v0.2 base_image: image: vshulman/vllm-openai-fixie:latest @@ -104,54 +117,54 @@ model_metadata: repo_id: fixie-ai/ultravox-v0.2 vllm_config: audio_token_id: 128002 -environment_variables: {} -external_package_dirs: [] python_version: py310 -runtime: - predict_concurrency: 512 requirements: - httpx resources: accelerator: A100 use_gpu: true +runtime: + predict_concurrency: 512 secrets: hf_access_token: null system_packages: -- python3.10-venv + - python3.10-venv ``` -## Deploy your Truss - -1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). -2. Install the latest version of Truss: `pip install --upgrade truss` -3. With `vllm` as your working directory, you can deploy the model with: +--- - ```sh - truss push --trusted - ``` +## Deploy Your Truss - Paste your Baseten API key if prompted. +1. [Sign up for Baseten](https://app.baseten.co/signup) and get an [API key](https://app.baseten.co/settings/account/api_keys). +2. Install Truss: + ```sh + pip install --upgrade truss + ``` +3. Deploy your model from the `vllm` directory: + ```sh + truss push --trusted + ``` + Enter your API key when prompted. -For more information, see [Truss documentation](https://truss.baseten.co). +[Truss documentation →](https://truss.baseten.co) -## Call your model +--- -Once your deployment is up, there are [many ways](https://docs.baseten.co/invoke/quickstart) to call your model. +## Call Your Model -### curl command +After deploying, invoke your model via [many methods](https://docs.baseten.co/invoke/quickstart). -#### If you are NOT using OpenAI compatible server +### Curl: Not OpenAI Compatible -``` +```sh curl -X POST https://model-.api.baseten.co/development/predict \ -H "Authorization: Api-Key $BASETEN_API_KEY" \ -d '{"prompt": "what is the meaning of life"}' ``` +### Curl: OpenAI Compatible -#### If you are using OpenAI compatible server - -``` +```sh curl -X POST "https://model-.api.baseten.co/development/predict" \ -H "Content-Type: application/json" \ -H 'Authorization: Api-Key {BASETEN_API_KEY}' \ @@ -161,25 +174,26 @@ curl -X POST "https://model-.api.baseten.co/development/predict" }' ``` -To access [production metrics](https://docs.vllm.ai/en/latest/serving/metrics.html) reported by OpenAI compatible server, simply add `metrics: true` to the request. 
+**Production Metrics:** +Add `"metrics": true` to your request for detailed metrics: -``` +```sh curl -X POST "https://model-.api.baseten.co/development/predict" \ -H "Content-Type: application/json" \ -H 'Authorization: Api-Key {BASETEN_API_KEY}' \ - -d '{ - "metrics": true - }' + -d '{"metrics": true}' ``` -### OpenAI SDK (if you are using OpenAI compatible server) +--- -``` +### OpenAI SDK (OpenAI-Compatible Only) + +```python from openai import OpenAI import os -model_id = "abcd1234" # Replace with your model ID -deployment_id = "4321cbda" # [Optional] Replace with your deployment ID +model_id = "abcd1234" # Replace with your model ID +deployment_id = "4321cbda" # [Optional] client = OpenAI( api_key=os.environ["BASETEN_API_KEY"], @@ -201,11 +215,13 @@ response = client.chat.completions.create( } ) print(response.choices[0].message.content) - ``` -For more information, see [API reference](https://docs.baseten.co/api-reference/openai). +[API Reference →](https://docs.baseten.co/api-reference/openai) + +--- ## Support -If you have any questions or need assistance, please open an issue in this repository or contact our support team. +Need help? +Open an issue in this repository or [contact Baseten support](https://www.baseten.co/contact). diff --git a/vllm/vllm_server/README.md b/vllm/vllm_server/README.md index 6a7aaac02..215825e1f 100644 --- a/vllm/vllm_server/README.md +++ b/vllm/vllm_server/README.md @@ -1,28 +1,36 @@ -# vLLM Truss to deploy chat completion model +# vLLM Truss: Deploy a Chat Completion Model -## What is this Truss example doing +## Overview -This is a codeless, easy OpenAI compatible solution to run a vllm server in a truss. Run a vllm server simply by modifying configurations, we'll handle the rest. +This Truss example offers a **codeless, OpenAI-compatible solution** to run a vLLM server within a Truss container. With minimal configuration, you can deploy powerful language models on our cloud—just update your settings and Truss will handle the rest. -## Configure your Truss by modifying the config.yaml +--- -### Basic options using 1 GPU +## Configuration Guide -To deploy a model using 1 GPU, the only config parameters you need to change are: +All deployment options are controlled via the `config.yaml` file. Follow the instructions below based on your GPU requirements: + +### 🚀 Basic: Single GPU Deployment + +To deploy a model using a single GPU, simply modify the following parameters in `config.yaml`: - `model_name` - `repo_id` - `accelerator` -### Basic options using multiple GPUs - -If your model needs more than 1 GPU to run using tensor parallel, you will need to change `accelerator`, and to set `tensor_parallel_size` and `distributed_executor_backend` accordingly. +No additional changes are required. -`tensor_parallel_size` and `distributed_executor_backend` are each arguments for the vllm serve command in the `config.yaml`. +--- -If you are using 4 GPUs for inference for example, you need to add the arguments `--tensor-parallel-size 4 --distributed-executor-backend mp` to the `vllm serve` command as well as indicating this new quantity under `accelerator: H100:4`. +### 🖥️ Advanced: Multi-GPU Deployment (Tensor Parallelism) -### Customize the vLLM server +If your model requires multiple GPUs, such as for tensor parallelism, you’ll need to configure: -This container starts by calling the `vllm serve` command under `start_command` in `config.yaml`. 
+- `accelerator` + Example for 4 H100 GPUs: + ```yaml + accelerator: H100:4 + ``` +- tensor_parallel_size +- distributed_executor_backend -See this [doc](https://docs.vllm.ai/en/v0.7.2/serving/engine_args.html) for all the ways you can customize the `vllm serve` command. These parameters give you control over the level of compilation, quantization, and much more. \ No newline at end of file +These last two are arguments for the `vllm serve` command within `config.yaml`. Add to the command as follows: `--tensor-parallel-size 4 --distributed-executor-backend mp` \ No newline at end of file From a4b0f2c8969f1d8b6adaf598ffabb6396c28ffb8 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 17:23:07 -0400 Subject: [PATCH 11/18] Improve styling bugs --- vllm/README.md | 6 +++--- vllm/truss_server/README.md | 10 +++++++++- vllm/vllm_server/README.md | 4 ++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/vllm/README.md b/vllm/README.md index 57a930059..2c7ea4277 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -11,7 +11,7 @@ This repository provides two approaches for deploying OpenAI-compatible chat com **Overview:** Leverage the built-in vLLM server for an OpenAI-compatible, codeless deployment. This is the recommended method for most users who want a fast, production-ready setup. -**How to Use:**= +**How to Use:** - See the [`vllm_server`](./vllm_server) directory for more details and instructions. **Why use this?** @@ -29,5 +29,5 @@ For advanced users who need custom inference logic, additional pre/post-processi - Refer to the [`truss_server`](./truss_server) directory for details and configuration examples. **Why use this?** -- Fully customizable inference and server logic. -- OpenAI-compatible with minimal client changes. +- Fully customizable inference and server logic +- OpenAI-compatible with minimal client changes diff --git a/vllm/truss_server/README.md b/vllm/truss_server/README.md index 20c8b8f6c..4643cc8f5 100644 --- a/vllm/truss_server/README.md +++ b/vllm/truss_server/README.md @@ -85,6 +85,9 @@ Override any [vLLM engine argument](https://docs.vllm.ai/en/latest/models/engine #### Example: Model Quantization +
+Example + ```yaml model_name: Mistral 7B v2 vLLM AWQ - T4 model_metadata: @@ -105,9 +108,13 @@ runtime: secrets: hf_access_token: null ``` +
#### Example: Custom Docker Image +
+Example + ```yaml model_name: Ultravox v0.2 base_image: @@ -130,6 +137,7 @@ secrets: system_packages: - python3.10-venv ``` +
--- @@ -142,7 +150,7 @@ system_packages: ``` 3. Deploy your model from the `vllm` directory: ```sh - truss push --trusted + truss push ``` Enter your API key when prompted. diff --git a/vllm/vllm_server/README.md b/vllm/vllm_server/README.md index 215825e1f..fc6787dc8 100644 --- a/vllm/vllm_server/README.md +++ b/vllm/vllm_server/README.md @@ -30,7 +30,7 @@ If your model requires multiple GPUs, such as for tensor parallelism, you’ll n ```yaml accelerator: H100:4 ``` -- tensor_parallel_size -- distributed_executor_backend +- `tensor_parallel_size` +- `distributed_executor_backend` These last two are arguments for the `vllm serve` command within `config.yaml`. Add to the command as follows: `--tensor-parallel-size 4 --distributed-executor-backend mp` \ No newline at end of file From c050854b6e61f7c179eb3c14dba42f4b40868b28 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 17:24:15 -0400 Subject: [PATCH 12/18] Improve dropdown styling --- vllm/truss_server/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/truss_server/README.md b/vllm/truss_server/README.md index 4643cc8f5..decdd0a69 100644 --- a/vllm/truss_server/README.md +++ b/vllm/truss_server/README.md @@ -86,7 +86,7 @@ Override any [vLLM engine argument](https://docs.vllm.ai/en/latest/models/engine #### Example: Model Quantization
-Example + ```yaml model_name: Mistral 7B v2 vLLM AWQ - T4 @@ -113,7 +113,7 @@ secrets: #### Example: Custom Docker Image
-Example + ```yaml model_name: Ultravox v0.2 From 85d198544f7410709925b3bcef9e9d90b1696f69 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 17:26:11 -0400 Subject: [PATCH 13/18] Improve styling --- vllm/truss_server/README.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vllm/truss_server/README.md b/vllm/truss_server/README.md index decdd0a69..a2799e48e 100644 --- a/vllm/truss_server/README.md +++ b/vllm/truss_server/README.md @@ -83,10 +83,8 @@ secrets: Override any [vLLM engine argument](https://docs.vllm.ai/en/latest/models/engine_args.html) by adding it to `vllm_config` in `model_metadata`. -#### Example: Model Quantization -
- +Example: Model Quantization ```yaml model_name: Mistral 7B v2 vLLM AWQ - T4 @@ -110,10 +108,8 @@ secrets: ```
-#### Example: Custom Docker Image -
- +Example: Custom Docker Image ```yaml model_name: Ultravox v0.2 From 4cfcdc1077b4632af54072720192c240792fda45 Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 17:28:44 -0400 Subject: [PATCH 14/18] Add more explanation to examples --- vllm/truss_server/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/truss_server/README.md b/vllm/truss_server/README.md index a2799e48e..3ab455213 100644 --- a/vllm/truss_server/README.md +++ b/vllm/truss_server/README.md @@ -79,7 +79,7 @@ secrets: --- -### Custom vLLM Engine Parameters +### Customization Override any [vLLM engine argument](https://docs.vllm.ai/en/latest/models/engine_args.html) by adding it to `vllm_config` in `model_metadata`. @@ -108,6 +108,9 @@ secrets: ```
+ +You can even override with your own customized vLLM docker image to work with models that are not supported yet by vanilla vLLM. +
Example: Custom Docker Image From 656cea22d623ea1e193fc4610d2766b9d8aad63f Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 17:30:53 -0400 Subject: [PATCH 15/18] Improve support link --- vllm/truss_server/README.md | 3 +-- vllm/vllm_server/README.md | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/truss_server/README.md b/vllm/truss_server/README.md index 3ab455213..706563176 100644 --- a/vllm/truss_server/README.md +++ b/vllm/truss_server/README.md @@ -230,5 +230,4 @@ print(response.choices[0].message.content) ## Support -Need help? -Open an issue in this repository or [contact Baseten support](https://www.baseten.co/contact). +Need help? [contact Baseten support](https://www.baseten.co/talk-to-us/). diff --git a/vllm/vllm_server/README.md b/vllm/vllm_server/README.md index fc6787dc8..b6e9f8777 100644 --- a/vllm/vllm_server/README.md +++ b/vllm/vllm_server/README.md @@ -33,4 +33,8 @@ If your model requires multiple GPUs, such as for tensor parallelism, you’ll n - `tensor_parallel_size` - `distributed_executor_backend` -These last two are arguments for the `vllm serve` command within `config.yaml`. Add to the command as follows: `--tensor-parallel-size 4 --distributed-executor-backend mp` \ No newline at end of file +These last two are arguments for the `vllm serve` command within `config.yaml`. Add to the command as follows: `--tensor-parallel-size 4 --distributed-executor-backend mp` + +## Support + +Need help? [contact Baseten support](https://www.baseten.co/talk-to-us/). \ No newline at end of file From f95c13a97b3e127b4d97f7c65da262241b7d00ed Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 17:35:20 -0400 Subject: [PATCH 16/18] Remove old links --- vllm/truss_server/README.md | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/vllm/truss_server/README.md b/vllm/truss_server/README.md index 706563176..16235ec5e 100644 --- a/vllm/truss_server/README.md +++ b/vllm/truss_server/README.md @@ -142,24 +142,22 @@ system_packages: ## Deploy Your Truss -1. [Sign up for Baseten](https://app.baseten.co/signup) and get an [API key](https://app.baseten.co/settings/account/api_keys). -2. Install Truss: - ```sh - pip install --upgrade truss - ``` -3. Deploy your model from the `vllm` directory: - ```sh - truss push - ``` - Enter your API key when prompted. - -[Truss documentation →](https://truss.baseten.co) +First [sign up for Baseten](https://app.baseten.co/signup) and get an [API key](https://app.baseten.co/settings/account/api_keys). + +```sh +# Install truss +pip install --upgrade truss + +# Deploy your model from the `vllm` directory +truss push + +``` --- ## Call Your Model -After deploying, invoke your model via [many methods](https://docs.baseten.co/invoke/quickstart). +After deploying, invoke your model. 
### Curl: Not OpenAI Compatible @@ -224,8 +222,6 @@ response = client.chat.completions.create( print(response.choices[0].message.content) ``` -[API Reference →](https://docs.baseten.co/api-reference/openai) - --- ## Support From 68c81447a2279eb5de75be608d7728763e1623bc Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Fri, 25 Jul 2025 18:33:59 -0400 Subject: [PATCH 17/18] Remove trailing whitespace --- vllm/README.md | 4 ++-- vllm/truss_server/README.md | 6 +++--- vllm/vllm_server/README.md | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/README.md b/vllm/README.md index 2c7ea4277..d1fc66528 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -8,7 +8,7 @@ This repository provides two approaches for deploying OpenAI-compatible chat com ### 1. **vLLM Server via `vllm serve` (Strongly Recommended)** -**Overview:** +**Overview:** Leverage the built-in vLLM server for an OpenAI-compatible, codeless deployment. This is the recommended method for most users who want a fast, production-ready setup. **How to Use:** @@ -22,7 +22,7 @@ Leverage the built-in vLLM server for an OpenAI-compatible, codeless deployment. ### 2. **vLLM with Truss Server** -**Overview:** +**Overview:** For advanced users who need custom inference logic, additional pre/post-processing, or further flexibility. **How to Use:** diff --git a/vllm/truss_server/README.md b/vllm/truss_server/README.md index 16235ec5e..ffc67c820 100644 --- a/vllm/truss_server/README.md +++ b/vllm/truss_server/README.md @@ -2,8 +2,8 @@ ## Overview -This repository demonstrates how to deploy [vLLM](https://github.com/vllm-project/vllm) using a Truss server. -**Use this approach only if you need custom inference logic or flexibility.** +This repository demonstrates how to deploy [vLLM](https://github.com/vllm-project/vllm) using a Truss server. +**Use this approach only if you need custom inference logic or flexibility.** For most users, we recommend the easier [vLLM server example](https://github.com/basetenlabs/truss-examples/tree/main/vllm/vllm_server), which is also OpenAI-compatible. This Truss works with asynchronous vLLM engines ([AsyncLLMEngine](https://docs.vllm.ai/en/v0.6.5/dev/engine/async_llm_engine.html#asyncllmengine)) and [all supported chat completion models](https://docs.vllm.ai/en/latest/models/supported_models.html). @@ -179,7 +179,7 @@ curl -X POST "https://model-.api.baseten.co/development/predict" }' ``` -**Production Metrics:** +**Production Metrics:** Add `"metrics": true` to your request for detailed metrics: ```sh diff --git a/vllm/vllm_server/README.md b/vllm/vllm_server/README.md index b6e9f8777..211dc947f 100644 --- a/vllm/vllm_server/README.md +++ b/vllm/vllm_server/README.md @@ -25,8 +25,8 @@ No additional changes are required. If your model requires multiple GPUs, such as for tensor parallelism, you’ll need to configure: -- `accelerator` - Example for 4 H100 GPUs: +- `accelerator` + Example for 4 H100 GPUs: ```yaml accelerator: H100:4 ``` From c7f04cbfe0fca95859420da0bc0568d9a6a4eebb Mon Sep 17 00:00:00 2001 From: Fred Liu Date: Mon, 28 Jul 2025 12:29:31 -0400 Subject: [PATCH 18/18] Fix end of files --- vllm/vllm_server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/vllm_server/README.md b/vllm/vllm_server/README.md index 211dc947f..ff21ca3ab 100644 --- a/vllm/vllm_server/README.md +++ b/vllm/vllm_server/README.md @@ -37,4 +37,4 @@ These last two are arguments for the `vllm serve` command within `config.yaml`. ## Support -Need help? 
[contact Baseten support](https://www.baseten.co/talk-to-us/). \ No newline at end of file +Need help? [contact Baseten support](https://www.baseten.co/talk-to-us/).