10 files changed: +912 -19 lines.
Changed paths:
- inference/a4x/single-host-serving/tensorrt-llm
- frameworks/a4x/trtllm-configs
- a3ultra/inference-templates/deployment/templates
- a4x/inference-templates/deployment
Note: large diffs are not rendered by default.
Original file line number Diff line number Diff line change 1+ # Copyright 2025 Google LLC
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
# NOTE(review): nesting reconstructed from a flattened diff view — confirm
# against the repository copy of this values file.

# Queue name; intentionally left empty in these defaults.
queue:

dwsSettings:
  # Left empty in these defaults.
  maxRunDurationSeconds:
19+
# Hugging Face credentials reference.
# NOTE(review): nesting reconstructed from a flattened diff view — confirm.
huggingface:
  secretName: hf-secret
  secretData:
    # Presumably the key inside the secret that holds the token, not the token
    # itself — verify against the deployment template.
    token: "hf_api_token"
24+
# Storage mounts for the workload.
# NOTE(review): nesting reconstructed from a flattened diff view — confirm.
volumes:
  gcsVolumes: true
  ssdMountPath: "/ssd"
  gcsMounts:
    # bucketName is left empty in these defaults.
    - bucketName:
      mountPath: "/gcs"
31+
# Service settings for the serving endpoint.
# NOTE(review): nesting reconstructed from a flattened diff view — confirm.
service:
  type: ClusterIP
  ports:
    http: 8000
36+
# Serving workload definition.
# NOTE(review): nesting reconstructed from a flattened diff view — confirm
# against the templates that read .Values.workload.*.
workload:
  model:
    # Left empty in these defaults.
    name:
  gpus: 4
  # image and framework are left empty in these defaults.
  image:
  framework:
  configFile: serving-args.yaml
  configPath: /workload/configs
  envs:
    # Quoted so the value stays the string "1" rather than an integer.
    - name: HF_HUB_ENABLE_HF_TRANSFER
      value: "1"
    - name: LAUNCHER_SCRIPT
      value: "/workload/launcher/launch-workload.sh"
    # Same location as configPath + "/" + configFile above.
    - name: SERVER_ARGS_FILE
      value: "/workload/configs/serving-args.yaml"
  benchmarks:
    experiments:
      # isl/osl presumably mean input/output sequence length — TODO confirm.
      - isl: 128
        osl: 128
        num_requests: 1000
57+
# Network / NCCL settings.
# NOTE(review): nesting reconstructed from a flattened diff view — confirm.
network:
  # The key literally includes "[]" and is left empty in these defaults.
  subnetworks[]:
  gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.7
  ncclSettings:
    - name: NCCL_DEBUG
      value: "VERSION"
# --- next file in the diff ---
# Serving arguments; presumably one of the files under
# frameworks/a4x/trtllm-configs (TODO confirm path).
tp_size: 4
ep_size: 4
pp_size: 1
backend: pytorch
kv_cache_free_gpu_mem_fraction: 0.85
# Extra arguments grouped under llm_api_args.
# NOTE(review): nesting reconstructed from a flattened diff view — in
# particular, confirm which keys sit under cuda_graph_config,
# kv_cache_config and moe_config.
llm_api_args:
  cuda_graph_config:
    batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 20
      - 24
      - 32
      - 64
      - 96
      - 128
      - 160
      - 192
      - 256
      - 320
      - 384
      - 512
    enable_padding: true
  enable_attention_dp: true
  enable_chunked_prefill: true
  kv_cache_config:
    dtype: auto
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.85
  moe_config:
    backend: CUTLASS
  print_iter_log: true
Original file line number Diff line number Diff line change @@ -182,6 +182,9 @@ spec:
182182 value : " {{ $root.Values.workload.model.name }}"
183183 - name : MODEL_DOWNLOAD_DIR
184184 value : " /ssd/{{ $root.Values.workload.model.name }}"
185+ # A3-Ultra recipe is based on the TensorRT image, which puts tensorrt_llm in a different path than default
186+ - name : TRTLLM_DIR
187+ value : " /workspace/tensorrtllm_backend/tensorrt_llm"
185188 {{- if $root.Values.workload.envs }}
186189 {{- toYaml .Values.workload.envs | nindent 12 }}
187190 {{- end }}
Original file line number Diff line number Diff line change 1+ # Copyright 2025 Google LLC
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
# Helm chart metadata for the single-host serving deployment template.
apiVersion: v2
name: single-host-serving-deployment-template
description: single-host-serving-deployment-template
type: application
version: 0.1.0
# Quoted so the version is always read as a string.
appVersion: "1.16.0"
Original file line number Diff line number Diff line change 1+ # Copyright 2025 Google LLC
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
# ConfigMap carrying the serving configuration text.
# Uses .Values.serving_config when set; otherwise falls back to the literal
# "config: null".
# NOTE(review): indentation reconstructed from a flattened diff view — confirm
# the column of the template directives against the repository.
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ .Release.Name }}-config"
data:
  serving-configuration: |-
{{- if .Values.serving_config }}
{{ .Values.serving_config | nindent 4 }}
{{- else }}
{{ "config: null" | nindent 4 }}
{{- end }}
Original file line number Diff line number Diff line change 1+ # Copyright 2025 Google LLC
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
# ConfigMap carrying the workload launcher script.
# Uses .Values.workload_launcher when set; otherwise embeds a stub script that
# prints an error and exits with status 1.
# NOTE(review): indentation reconstructed from a flattened diff view — confirm
# the column of the template directives against the repository.
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ .Release.Name }}-launcher"
data:
  launch-workload.sh: |-
{{- if .Values.workload_launcher }}
{{ .Values.workload_launcher | nindent 4 }}
{{- else }}
    #!/bin/bash
    echo "No workload launcher specified"
    exit 1
{{- end }}
You can’t perform that action at this time.
0 commit comments