Commit 0a556a6
Add TPU Trillium Multi-Host RayCluster for MaxText (#1807)
* Add V6E multi-host RayCluster for MaxText
* add header
* Add Ray Train script
* update maxtext trainer script
* Add Dockerfile and region tags
* add license header and new line

Signed-off-by: Ryan O'Leary <[email protected]>
Co-authored-by: Mofi Rahman <[email protected]>
1 parent 5799e55 commit 0a556a6

File tree: 3 files changed (+224, -0 lines)
Dockerfile (44 additions, 0 deletions)

```dockerfile
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_gke_ray_ray_train_maxtext_dockerfile]
# Start from a Ray base image, which includes the JaxTrainer API.
# MaxText with TPU requires Python 3.12.
FROM rayproject/ray:2.49.1-py312

USER root
RUN groupadd -r ray 2>/dev/null || true && usermod -g ray ray

RUN sudo apt-get update -y \
    && sudo apt-get install --no-install-recommends -y git \
    && sudo rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Clone the MaxText repo and build from source, installing TPU dependencies.
RUN git clone https://github.com/AI-Hypercomputer/maxtext.git

RUN pip install --no-cache-dir uv

RUN cd maxtext && \
    uv pip install --no-cache --system -e .[tpu] --resolution=lowest && \
    install_maxtext_github_deps

# Copy the Ray MaxText trainer to run on the remote container.
COPY maxtext_ray_trainer.py .

RUN chown -R ray:ray .
ENV PYTHONPATH=/app/maxtext/src:/app/maxtext:/app
USER ray
# [END gke_ai_ml_gke_ray_ray_train_maxtext_dockerfile]
```
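The image relies on the `PYTHONPATH` set above so that both the Ray Train v2 API and MaxText resolve at import time. A minimal smoke-test sketch, assuming the image has been built and tagged (the tag and script name below are placeholders, not part of this commit):

```python
# check_image.py -- hypothetical smoke test, run inside the container, e.g.:
#   docker run --rm YOUR_IMAGE_TAG python check_image.py
# Confirms that the PYTHONPATH entries baked into the Dockerfile resolve.
import importlib

for module in ("ray", "ray.train.v2.jax", "MaxText.train"):
    importlib.import_module(module)
    print(f"import ok: {module}")
```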
maxtext_ray_trainer.py (61 additions, 0 deletions)

```python
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_gke_ray_ray_train_maxtext_ray_trainer]
import logging
from typing import Sequence

from absl import app
import ray
from ray.train.v2.api.config import RunConfig, ScalingConfig
from ray.train.v2.jax import JaxTrainer


def train_loop_per_worker(config):
    # Import inside the loop so MaxText is only loaded on the TPU workers.
    from MaxText.train import main as maxtext_main

    argv = config["argv"]
    maxtext_main(argv)


def main(argv: Sequence[str]):
    trainer = JaxTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={"argv": argv},
        scaling_config=ScalingConfig(
            use_tpu=True,
            # One Ray worker per TPU host: a v6e 4x4 slice spans 4 hosts
            # with 4 chips each, matching numOfHosts: 4 in the RayCluster.
            num_workers=4,
            topology="4x4",
            accelerator_type="TPU-V6E",
            resources_per_worker={"TPU": 4},
            placement_strategy="SPREAD",
        ),
        run_config=RunConfig(
            name="maxtext_jaxtrainer",
            worker_runtime_env={
                "env_vars": {
                    "JAX_PLATFORMS": "tpu",
                    "ENABLE_PJRT_COMPATIBILITY": "true",
                    "TPU_SLICE_BUILDER_DUMP_CHIP_FORCE": "true",
                    "TPU_SLICE_BUILDER_DUMP_ICI": "true",
                    "XLA_FLAGS": "--xla_dump_to=/tmp/xla_dump_file --xla_dump_hlo_as_proto",
                }
            },
        ),
    )
    result = trainer.fit()
    logging.info("Training complete!")
    ray.shutdown()


if __name__ == "__main__":
    app.run(main)
# [END gke_ai_ml_gke_ray_ray_train_maxtext_ray_trainer]
```
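Because `main` forwards its own `argv` straight into `MaxText.train.main`, the script is launched the same way as MaxText itself: a config file followed by `key=value` overrides. A hedged launch sketch (the config path, run name, and override values below are placeholders, not values from this commit):

```python
# Hypothetical launch sketch: everything on the command line is passed
# through train_loop_config["argv"] to MaxText.train.main on each worker.
import subprocess

subprocess.run(
    [
        "python", "maxtext_ray_trainer.py",
        "MaxText/configs/base.yml",        # assumed MaxText config path
        "run_name=ray-train-demo",         # placeholder run name
        "base_output_directory=/data",     # the GCS FUSE mount from the manifest below
        "dataset_type=synthetic",          # assumed override to avoid a real dataset
        "steps=10",
    ],
    check=True,
)
```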
RayCluster manifest for a TPU v6e (Trillium) 4x4 slice (119 additions, 0 deletions)

```yaml
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_gke_ray_ray_train_maxtext_ray_cluster_tpu_v6e_16]
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: maxtext-tpu-cluster
spec:
  headGroupSpec:
    rayStartParams: {}
    template:
      metadata:
        annotations:
          gke-gcsfuse/volumes: "true"
          gke-gcsfuse/cpu-limit: "0"
          gke-gcsfuse/memory-limit: "0"
          gke-gcsfuse/ephemeral-storage-limit: "0"
      spec:
        serviceAccountName: ${KSA_NAME}
        containers:
          - name: ray-head
            image: ${DOCKER_IMAGE}
            imagePullPolicy: IfNotPresent
            ports:
              - containerPort: 6379
                name: gcs-server
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
            resources:
              limits:
                memory: "16Gi"
              requests:
                cpu: "8"
                memory: "16Gi"
            volumeMounts:
              - name: gcs-fuse-csi-ephemeral
                mountPath: /data
              - name: dshm
                mountPath: /dev/shm
        volumes:
          - name: gcs-fuse-cache
            emptyDir:
              medium: Memory
          - name: dshm
            emptyDir:
              medium: Memory
          - name: gcs-fuse-csi-ephemeral
            csi:
              driver: gcsfuse.csi.storage.gke.io
              volumeAttributes:
                bucketName: ${GS_BUCKET}
                mountOptions: "implicit-dirs"
  workerGroupSpecs:
    - replicas: 1
      numOfHosts: 4
      groupName: tpu-group
      rayStartParams: {}
      template:
        metadata:
          annotations:
            gke-gcsfuse/volumes: "true"
            gke-gcsfuse/cpu-limit: "0"
            gke-gcsfuse/memory-limit: "0"
            gke-gcsfuse/ephemeral-storage-limit: "0"
        spec:
          serviceAccountName: ${KSA_NAME}
          containers:
            - name: ray-worker
              image: ${DOCKER_IMAGE}
              imagePullPolicy: IfNotPresent
              resources:
                limits:
                  memory: 200G
                  google.com/tpu: "4"
                requests:
                  cpu: "8"
                  memory: 200G
                  google.com/tpu: "4"
              env:
                - name: JAX_PLATFORMS
                  value: tpu
                - name: ENABLE_PJRT_COMPATIBILITY
                  value: "true"
              volumeMounts:
                - name: gcs-fuse-csi-ephemeral
                  mountPath: /data
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            - name: gcs-fuse-cache
              emptyDir:
                medium: Memory
            - name: dshm
              emptyDir:
                medium: Memory
            - name: gcs-fuse-csi-ephemeral
              csi:
                driver: gcsfuse.csi.storage.gke.io
                volumeAttributes:
                  bucketName: ${GS_BUCKET}
                  mountOptions: "implicit-dirs"
          nodeSelector:
            cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
            cloud.google.com/gke-tpu-topology: 4x4
# [END gke_ai_ml_gke_ray_ray_train_maxtext_ray_cluster_tpu_v6e_16]
```
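Once the cluster is applied, jobs can be submitted through the Ray dashboard port (8265) exposed on the head Pod. A sketch using Ray's job submission API, assuming the conventional KubeRay head Service name and a local port-forward (both assumptions, not part of this commit):

```python
# Hypothetical submission sketch, assuming:
#   kubectl port-forward svc/maxtext-tpu-cluster-head-svc 8265:8265
# (KubeRay conventionally names the head Service <cluster-name>-head-svc.)
from ray.job_submission import JobSubmissionClient

client = JobSubmissionClient("http://localhost:8265")
job_id = client.submit_job(
    # The Dockerfile copies maxtext_ray_trainer.py into /app on the image.
    entrypoint=(
        "python /app/maxtext_ray_trainer.py "
        "MaxText/configs/base.yml "        # assumed config path
        "run_name=ray-train-demo "         # placeholder run name
        "base_output_directory=/data "     # GCS FUSE mount from this manifest
        "dataset_type=synthetic steps=10"  # placeholder overrides
    ),
)
print("submitted job:", job_id)
```

From there, `client.get_job_status(job_id)` can be polled until the job reaches a terminal state, and `client.get_job_logs(job_id)` retrieves its output.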
