Skip to content

Commit 27e2e15

Browse files
authored
Merge pull request #74 from Abhishekbhagwat/main
feat: Llama-4-Maverick-16B-128E multihost serving recipe
2 parents 1869642 + 12a7e12 commit 27e2e15

File tree

10 files changed

+841
-2
lines changed

10 files changed

+841
-2
lines changed

inference/trillium/JetStream-Maxtext/Llama-4-Maverick-17B-128E/README.md

Lines changed: 407 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
FROM ubuntu:22.04
16+
17+
ENV DEBIAN_FRONTEND=noninteractive
18+
19+
# Install base OS tooling. apt-get is used instead of apt because apt warns that
# its CLI is not stable for scripts; the package index is removed in the same
# layer so it is not baked into the image (later steps run their own update).
RUN apt-get update -y && apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg git python3.10 python3-pip curl nano vim && rm -rf /var/lib/apt/lists/*
20+
21+
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
22+
# Register the Google Cloud apt repository, import its signing key and install
# the SDK. curl -f keeps an HTTP error body from being dearmored into the
# keyring; -sS hides the progress meter but still prints errors; -L follows
# redirects. The package index is dropped afterwards to keep the layer small.
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && apt-get update -y && apt-get install google-cloud-sdk -y && rm -rf /var/lib/apt/lists/*
23+
24+
RUN python3 -m pip install --upgrade pip
25+
26+
ENV JAX_PLATFORMS=proxy
27+
ENV JAX_BACKEND_TARGET=grpc://localhost:38681
28+
ENV XCLOUD_ENVIRONMENT=GCP
29+
30+
ENV MAXTEXT_VERSION=main
31+
ENV JETSTREAM_VERSION=main
32+
33+
RUN git clone https://github.com/AI-Hypercomputer/JetStream.git && \
34+
git clone https://github.com/AI-Hypercomputer/maxtext.git
35+
36+
RUN cd maxtext/ && \
37+
git checkout ${MAXTEXT_VERSION} && \
38+
bash setup.sh
39+
40+
RUN cd /JetStream && \
41+
git checkout ${JETSTREAM_VERSION} && \
42+
pip install -e .
43+
44+
RUN pip install setuptools fastapi uvicorn
45+
46+
# Install Python headers and the C/C++ toolchain in a single apt-get transaction
# (presumably needed to build native Python extensions at runtime - confirm),
# then remove the package index so it does not stay in the final image layer.
RUN apt-get update -y && apt-get install -y python3-dev build-essential && rm -rf /var/lib/apt/lists/*
47+
48+
ENTRYPOINT [ "/bin/bash" ]
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
steps:
16+
- name: 'gcr.io/cloud-builders/docker'
17+
args:
18+
- 'build'
19+
- '--tag=${_ARTIFACT_REGISTRY}/${_JETSTREAM_MAXTEXT_IMAGE}:${_JETSTREAM_MAXTEXT_VERSION}'
20+
- '--file=Dockerfile'
21+
- '.'
22+
automapSubstitutions: true
23+
24+
images:
25+
- '${_ARTIFACT_REGISTRY}/${_JETSTREAM_MAXTEXT_IMAGE}:${_JETSTREAM_MAXTEXT_VERSION}'
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v2
16+
name: trillium-maxtext-jetstream-llama-serve-model
17+
description: trillium-maxtext-jetstream-llama-serve-model
18+
type: application
19+
version: 0.1.0
20+
appVersion: "1.16.0"
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
{{- if .Values.convert_hf_ckpt }}
2+
{{- $root := . }}
3+
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: {{ .Release.Name }}-convert-ckpt
8+
labels:
9+
app: {{ .Release.Name }}-convert-ckpt
10+
app.kubernetes.io/instance: {{ .Release.Name }}
11+
app.kubernetes.io/managed-by: {{ .Release.Service }}
12+
spec:
13+
template:
14+
metadata:
15+
labels:
16+
app: {{ .Release.Name }}-convert-ckpt
17+
app.kubernetes.io/instance: {{ .Release.Name }}
18+
spec:
19+
restartPolicy: OnFailure
20+
affinity:
21+
nodeAffinity:
22+
requiredDuringSchedulingIgnoredDuringExecution:
23+
nodeSelectorTerms:
24+
- matchExpressions:
25+
- key: cloud.google.com/machine-family
26+
operator: In
27+
values:
28+
- n2
29+
- key: node.kubernetes.io/instance-type
30+
operator: In
31+
values:
32+
- n2-highmem-80
33+
34+
volumes:
35+
- name: shared-memory
36+
emptyDir:
37+
medium: "Memory"
38+
sizeLimit: 50Gi
39+
- name: local-ssd
40+
hostPath:
41+
path: /mnt/stateful_partition/kube-ephemeral-ssd
42+
43+
containers:
44+
- name: convert-model
45+
image: "{{ .Values.job.image.repository }}:{{ .Values.job.image.tag }}"
46+
imagePullPolicy: Always
47+
securityContext:
48+
privileged: true
49+
50+
env:
51+
- name: HF_TOKEN
52+
valueFrom:
53+
secretKeyRef:
54+
name: "{{ .Values.huggingface.secretName }}"
55+
key: "{{ .Values.huggingface.secretData.token }}"
56+
- name: BASE_MODEL_PATH
57+
value: "/ssd/{{ .Values.model.name }}"
58+
- name: CHECKPOINT_TPU_UNSCANNED
59+
value: "/ssd/{{ .Values.model.name }}/output/unscanned_ckpt/checkpoints"
60+
- name: MODEL_SIZE
61+
value: "llama4-17b-128e"
62+
{{- range $gcs := $root.Values.volumes.gcsMounts }}
63+
- name: GCS_FUSE_BUCKET
64+
value: "{{ $gcs.bucketName }}"
65+
{{- end }}
66+
67+
workingDir: /workspace
68+
command: ["/bin/bash", "-c"]
69+
args:
70+
- |
71+
set -eux
72+
73+
echo "Starting conversion job on a CPU node..."
74+
df -h
75+
76+
pip install torch --index-url https://download.pytorch.org/whl/cpu
77+
pip install huggingface_hub[hf_xet]
78+
79+
# Remove any previous download. BASE_MODEL_PATH is set in this container's env
# to the same /ssd/<model name> path, so use it (quoted) instead of repeating
# the templated path; set -u above aborts the script if it is ever unset.
rm -rf "${BASE_MODEL_PATH}"
80+
81+
82+
echo "Downloading HuggingFace model to ${BASE_MODEL_PATH}..."
83+
84+
huggingface-cli download {{ .Values.model.name }} --local-dir ${BASE_MODEL_PATH} --local-dir-use-symlinks False
85+
86+
echo "Starting GCSFuse Mount"
87+
mkdir -p /gcs
88+
gcsfuse --client-protocol http2 ${GCS_FUSE_BUCKET} /gcs
89+
90+
echo "Running MaxText checkpoint conversion (on CPU node)..."
91+
cd /maxtext || exit 1
92+
JAX_PLATFORMS=cpu python3 -m MaxText.llama4_ckpt_unscanned \
93+
--base-model-path ${BASE_MODEL_PATH} \
94+
--maxtext-model-path ${CHECKPOINT_TPU_UNSCANNED} \
95+
--model-size ${MODEL_SIZE} \
96+
--huggingface-checkpoint
97+
98+
gsutil -m cp -r ${CHECKPOINT_TPU_UNSCANNED} /gcs/{{ .Values.model.name }}/output/unscanned_ckpt/checkpoints/
99+
100+
echo "Conversion Job Complete. Unscanned checkpoints should be at ${CHECKPOINT_TPU_UNSCANNED}"
101+
102+
volumeMounts:
103+
- name: shared-memory
104+
mountPath: /dev/shm
105+
- name: local-ssd
106+
# Quote the templated path and fall back to /ssd (the prefix hard-coded in the
# BASE_MODEL_PATH and CHECKPOINT_TPU_UNSCANNED env values) when the chart value
# is unset, so the rendered manifest never carries a null mountPath.
mountPath: "{{ $root.Values.volumes.ssdMountPath | default "/ssd" }}"
107+
backoffLimit: 1
108+
{{- end }}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v2
16+
name: trillium-maxtext-jetstream-llama-serve-model
17+
description: trillium-maxtext-jetstream-llama-serve-model
18+
type: application
19+
version: 0.1.0
20+
appVersion: "1.16.0"
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v1
16+
kind: ConfigMap
17+
metadata:
18+
name: "{{ .Release.Name }}"
19+
data:
20+
maxtext-configuration.yaml: |-
21+
{{- range $key, $value := .Values.maxtext_config }}
22+
{{ $key }}: {{ $value }}
23+
{{- end }}
24+
25+
libtpu-init-args: |-
26+
{{- range $key, $value := .Values.xla_flags }}
27+
--{{ $key }}={{ $value }}
28+
{{- end }}
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
{{- $root := . }}
15+
16+
apiVersion: pathways-job.pathways.domain/v1
17+
kind: PathwaysJob
18+
metadata:
19+
name: pathways
20+
spec:
21+
maxRestarts: 1
22+
workers:
23+
- type: ct6e-standard-4t
24+
topology: 8x8
25+
numSlices: 1
26+
pathwaysDir: "{{ .Values.pathwaysDir }}"
27+
controller:
28+
deploymentMode: colocate_head_with_workers
29+
mainContainerName: jetstream
30+
template:
31+
spec:
32+
volumes:
33+
- name: workload-configuration
34+
configMap:
35+
name: "{{.Release.Name}}"
36+
containers:
37+
- name: jetstream
38+
securityContext:
39+
privileged: true
40+
image: "{{ .Values.job.image.repository }}:{{ .Values.job.image.tag }}"
41+
command: ["/bin/bash", "-c"]
42+
args:
43+
- |
44+
set -eux
45+
echo "Starting model serving deployment on a TPU node..."
46+
47+
pip install torch --index-url https://download.pytorch.org/whl/cpu
48+
python3 -m nltk.downloader punkt_tab
49+
50+
# Parse server configurations from values file
51+
echo "MaxText configuration file:"
52+
if [ -f /etc/workload-configuration/maxtext-configuration.yaml ]; then
53+
sed 's/^/| /' /etc/workload-configuration/maxtext-configuration.yaml
54+
else
55+
echo "MaxText configuration file not found at /etc/workload-configuration/maxtext-configuration.yaml"
56+
fi
57+
echo ""
58+
59+
OPTIONS=()
60+
if [ -f /etc/workload-configuration/maxtext-configuration.yaml ]; then
61+
while IFS= read -r line || [[ -n "$line" ]]; do
62+
[[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue
63+
key=$(echo "$line" | cut -d':' -f1 | tr -d '[:space:]')
64+
value=$(echo "$line" | cut -d':' -f2- | sed 's/^[[:space:]]*//')
65+
if [[ "$value" == \$* ]]; then
66+
var_name=${value#\$}
67+
if [[ -z "$var_name" ]]; then expanded_value="$"; else expanded_value="${!var_name:-$value}"; fi
68+
OPTIONS+=("$key=$expanded_value")
69+
else
70+
OPTIONS+=("$key=$value")
71+
fi
72+
done < /etc/workload-configuration/maxtext-configuration.yaml
73+
fi
74+
echo "===== MaxText Configuration Options ====="
75+
echo "${OPTIONS[@]}"
76+
77+
echo "Starting GCSFuse Mount"
78+
mkdir -p /gcs
79+
gcsfuse --client-protocol http2 ${GCS_FUSE_BUCKET} /gcs
80+
81+
# Start the JetStream MaxText server
82+
echo "Starting JetStream MaxText server on TPU node..."
83+
cd /maxtext || exit 1
84+
python3 -m MaxText.maxengine_server \
85+
/maxtext/MaxText/configs/v6e/inference/llama4_maverick_v6e-64.yml \
86+
"${OPTIONS[@]}"
87+
88+
imagePullPolicy: Always
89+
ports:
90+
- containerPort: 9000
91+
env:
92+
- name: ENABLE_PATHWAYS_PERSISTENCE
93+
value: "1"
94+
- name: HF_TOKEN
95+
valueFrom:
96+
secretKeyRef:
97+
name: "{{ .Values.huggingface.secretName }}"
98+
key: "{{ .Values.huggingface.secretData.token }}"
99+
- name: BASE_MODEL_PATH
100+
value: "/gcs/{{ .Values.model.name }}"
101+
- name: CHECKPOINT_TPU_UNSCANNED
102+
value: "/gcs/{{ .Values.model.name }}/output/unscanned_ckpt/checkpoints"
103+
- name: MODEL_SIZE
104+
value: "llama4-17b-128e"
105+
{{- range $gcs := $root.Values.volumes.gcsMounts }}
106+
- name: GCS_FUSE_BUCKET
107+
value: "{{ $gcs.bucketName }}"
108+
{{- end }}
109+
110+
volumeMounts:
111+
- name: workload-configuration
112+
mountPath: /etc/workload-configuration
113+
- name: jetstream-http
114+
image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.3
115+
imagePullPolicy: Always
116+
ports:
117+
- containerPort: 8000
118+

0 commit comments

Comments
 (0)