Skip to content

Commit 15a7ecb

Browse files
authored
add mithril h100 pool (#187)
Signed-off-by: simon-mo <[email protected]>
1 parent 2d821ae commit 15a7ecb

File tree

1 file changed: +35 -1 lines changed

buildkite/test-template-ci.j2

Lines changed: 35 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,8 @@ agents:
102102
queue: cpu_queue_premerge_us_east_1
103103
{% elif step.gpu == "a100" %}
104104
queue: a100_queue
105+
{% elif step.gpu == "h100" %}
106+
queue: mithril-h100-pool
105107
{% elif step.gpu == "h200" %}
106108
queue: skylab-h200
107109
{% elif step.gpu == "b200" %}
@@ -130,7 +132,7 @@ retry:
130132

131133
{% if step.num_nodes < 2 %}
132134
plugins:
133-
{% if step.gpu != "a100" and step.gpu != "h200" and step.gpu != "b200" %}
135+
{% if step.gpu != "a100" and step.gpu != "h100" and step.gpu != "h200" and step.gpu != "b200" %}
134136
- docker#v5.2.0:
135137
image: {{ image }}
136138
always-pull: true
@@ -207,6 +209,38 @@ plugins:
207209
- /dev/shm:/dev/shm
208210
- /data/benchmark-hf-cache:/benchmark-hf-cache
209211
- /data/benchmark-vllm-cache:/root/.cache/vllm
212+
{% elif step.gpu == "h100" %}
213+
- kubernetes:
214+
podSpec:
215+
containers:
216+
- image: {{ image }}
217+
command:
218+
- bash -c "{{ '(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe) }}"
219+
resources:
220+
limits:
221+
nvidia.com/gpu: {{ step.num_gpus or 1 }}
222+
volumeMounts:
223+
- name: devshm
224+
mountPath: /dev/shm
225+
- name: hf-cache
226+
mountPath: {{ hf_home }}
227+
env:
228+
- name: VLLM_USAGE_SOURCE
229+
value: ci-test
230+
- name: NCCL_CUMEM_HOST_ENABLE
231+
value: "0"
232+
- name: HF_HOME
233+
value: {{ hf_home }}
234+
nodeSelector:
235+
nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3
236+
volumes:
237+
- name: devshm
238+
emptyDir:
239+
medium: Memory
240+
- name: hf-cache
241+
hostPath:
242+
path: /mnt/hf-cache
243+
type: Directory
210244
{% else %}
211245
- kubernetes:
212246
podSpec:

0 commit comments

Comments
 (0)