Skip to content

Commit 0017ec3

Browse files
authored
Adds a sample inferencing recipe for TRT-LLM on A4X (#50)
* WIP recipe for a4x trtllm inference * Plumbs through additional properties needed to support setting llm_api_args, customizing kvcache free mem, and adds an actual output example. * Delete lightweight test model config
1 parent 279f659 commit 0017ec3

File tree

10 files changed

+912
-19
lines changed

10 files changed

+912
-19
lines changed

inference/a4x/single-host-serving/tensorrt-llm/README.md

Lines changed: 384 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Values for the A4X single-host TensorRT-LLM serving recipe.
# NOTE(review): the nesting below was reconstructed from a flattened diff
# view; confirm it against the chart templates that read these values.
# Keys set to null are placeholders that have no default in this file.

queue: null

dwsSettings:
  maxRunDurationSeconds: null

huggingface:
  secretName: hf-secret
  secretData:
    # NOTE(review): looks like a placeholder rather than a real token -
    # confirm the real value is supplied at install time and never committed.
    token: "hf_api_token"

volumes:
  gcsVolumes: true
  ssdMountPath: "/ssd"
  gcsMounts:
    - bucketName: null
      mountPath: "/gcs"

service:
  type: ClusterIP
  ports:
    http: 8000

workload:
  model:
    name: null
  gpus: 4
  image: null
  framework: null
  configFile: serving-args.yaml
  configPath: /workload/configs
  envs:
    - name: HF_HUB_ENABLE_HF_TRANSFER
      value: "1"
    - name: LAUNCHER_SCRIPT
      value: "/workload/launcher/launch-workload.sh"
    # Equals configPath + "/" + configFile above; keep the three in sync.
    - name: SERVER_ARGS_FILE
      value: "/workload/configs/serving-args.yaml"
  benchmarks:
    experiments:
      - isl: 128
        osl: 128
        num_requests: 1000

network:
  # NOTE(review): the key is literally "subnetworks[]" (brackets included),
  # not a list named "subnetworks" - confirm this is the intended placeholder.
  subnetworks[]: null
  gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.7
  ncclSettings:
    - name: NCCL_DEBUG
      value: "VERSION"
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Serving arguments for the TensorRT-LLM server.
# NOTE(review): nesting was reconstructed from a flattened diff view; confirm
# which keys belong under llm_api_args and cuda_graph_config.
tp_size: 4
ep_size: 4
pp_size: 1
backend: pytorch
# Same value as llm_api_args.kv_cache_config.free_gpu_memory_fraction below.
# NOTE(review): confirm which of the two settings the launcher honors.
kv_cache_free_gpu_mem_fraction: 0.85
llm_api_args:
  cuda_graph_config:
    batch_sizes: [1, 2, 4, 8, 16, 20, 24, 32, 64, 96, 128, 160, 192, 256, 320,
                  384, 512]
    enable_padding: true
  enable_attention_dp: true
  enable_chunked_prefill: true
  kv_cache_config:
    dtype: auto
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.85
  moe_config:
    backend: CUTLASS
  print_iter_log: true

src/helm-charts/a3ultra/inference-templates/deployment/templates/serving-launcher.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,9 @@ spec:
182182
value: "{{ $root.Values.workload.model.name }}"
183183
- name: MODEL_DOWNLOAD_DIR
184184
value: "/ssd/{{ $root.Values.workload.model.name }}"
185+
# The A3-Ultra recipe is based on the TensorRT image, which puts tensorrt_llm in a different path than the default.
186+
- name: TRTLLM_DIR
187+
value: "/workspace/tensorrtllm_backend/tensorrt_llm"
185188
{{- if $root.Values.workload.envs }}
186189
{{- toYaml .Values.workload.envs | nindent 12 }}
187190
{{- end }}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Helm chart metadata for the single-host serving deployment template.
apiVersion: v2
name: single-host-serving-deployment-template
description: single-host-serving-deployment-template
type: application
# Version of the chart itself.
version: 0.1.0
# NOTE(review): "1.16.0" is the value `helm create` scaffolds by default -
# confirm it is intentional for this chart.
appVersion: "1.16.0"
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# ConfigMap holding the serving configuration under the key
# "serving-configuration". The content comes from the serving_config value
# when it is set; otherwise the literal document "config: null" is used.
# The nindent calls are left-trimmed so the block scalar does not start with
# an empty line.
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ .Release.Name }}-config"
data:
  serving-configuration: |-
    {{- if .Values.serving_config }}
    {{- .Values.serving_config | nindent 4 }}
    {{- else }}
    {{- "config: null" | nindent 4 }}
    {{- end }}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# ConfigMap holding the workload launcher script under the key
# "launch-workload.sh". The content comes from the workload_launcher value
# when it is set; otherwise a stub script that prints an error and exits 1
# is used.
# The nindent call is left-trimmed so the supplied script starts on the first
# line of the block scalar (no empty line ahead of its shebang).
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ .Release.Name }}-launcher"
data:
  launch-workload.sh: |-
    {{- if .Values.workload_launcher }}
    {{- .Values.workload_launcher | nindent 4 }}
    {{- else }}
    #!/bin/bash
    echo "No workload launcher specified"
    exit 1
    {{- end }}

0 commit comments

Comments
 (0)