
Commit e4067bf

Merge branch 'main' into docs-reorg

2 parents bef18ba + a8499dd


59 files changed: 4,129 additions and 807 deletions.

.github/workflows/RunTests.yml

Lines changed: 6 additions & 1 deletion
@@ -65,17 +65,22 @@ jobs:
 
   cpu_unit_tests:
     needs: tpu_image
+    strategy:
+      fail-fast: false
+      matrix:
+        worker_group: [1, 2, 3, 4]
     uses: ./.github/workflows/run_tests_internal.yml
     with:
       device_type: cpu
       device_name: X64
-      cloud_runner: linux-x86-n2-16
       image_type: tpu
       pytest_marker: 'cpu_only'
       xla_python_client_mem_fraction: 0.75
       tf_force_gpu_allow_growth: false
       container_resource_option: "--privileged"
       is_scheduled_run: ${{ github.event_name == 'schedule' }}
+      worker_group: ${{ matrix.worker_group }}
+      total_workers: 4
 
   tpu_unit_tests:
     needs: tpu_image
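The new `strategy.matrix` fans `cpu_unit_tests` out into four parallel jobs; each passes its `worker_group` (plus the fixed `total_workers: 4`) down to the reusable workflow, which turns them into pytest-split arguments. Roughly what one shard executes, as a local sketch (the `cpu_only` marker comes from this workflow; the shard numbers here are illustrative):

```bash
# Run shard 3 of 4 of the cpu_only suite with pytest-split:
# --splits = total shard count, --group = this job's 1-based shard id.
python3 -m pip install --quiet pytest-split
python3 -m pytest -v -m "cpu_only" --durations=0 --splits 4 --group 3
```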

.github/workflows/build_and_test_maxtext.yml

Lines changed: 3 additions & 0 deletions
@@ -51,6 +51,7 @@ jobs:
       fail-fast: false # don't cancel all jobs on failure
       matrix:
         image_type: ["py312"]
+        worker_group: [1, 2, 3, 4]
     with:
       device_type: cpu
       device_name: X64
@@ -61,6 +62,8 @@
       tf_force_gpu_allow_growth: false
       container_resource_option: "--privileged"
       is_scheduled_run: ${{ github.event_name == 'schedule' }}
+      worker_group: ${{ matrix.worker_group }}
+      total_workers: 4
 
   maxtext_tpu_unit_tests:
     needs: build_and_upload_maxtext_package

.github/workflows/run_tests_against_package.yml

Lines changed: 15 additions & 6 deletions
@@ -31,10 +31,6 @@ on:
       pytest_marker:
         required: true
         type: string
-      pytest_addopts:
-        required: false
-        type: string
-        default: ''
       is_scheduled_run:
         required: true
         type: string
@@ -50,12 +46,20 @@ on:
       cloud_runner:
         required: false
         type: string
+      worker_group:
+        required: false
+        type: number
+        default: 1
+      total_workers:
+        required: false
+        type: number
+        default: 1
 
 permissions:
   contents: read
 jobs:
   run:
-    runs-on: ${{ inputs.cloud_runner }}
+    runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
     container:
       image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }}
       env:
@@ -97,5 +101,10 @@ jobs:
         export MAXTEXT_ASSETS_ROOT=$(pwd)/src/MaxText/assets
         export MAXTEXT_TEST_ASSETS_ROOT=$(pwd)/src/MaxText/test_assets
         export MAXTEXT_PKG_DIR=$(pwd)/src/MaxText
+        # omit this libtpu init args for gpu tests
+        if [ "${{ inputs.device_type }}" != "cuda12" ]; then
+          export LIBTPU_INIT_ARGS='--xla_tpu_scoped_vmem_limit_kib=65536'
+        fi
         # TODO: Fix the skipped tests and remove the deselect flags
-        .venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" --durations=0 --deselect "tests/aot_hlo_identical_test.py::AotHloIdenticalTest::test_default_hlo_match" --deselect "tests/tokenizer_test.py::TokenizerTest::test_detokenize"
+        [ "${{ inputs.total_workers }}" -gt 1 ] && .venv/bin/python3 -m pip install --quiet pytest-split && SPLIT_ARGS="--splits ${{ inputs.total_workers }} --group ${{ inputs.worker_group }}" || SPLIT_ARGS=""
+        .venv/bin/python3 -m pytest -v -m "${FINAL_PYTEST_MARKER}" --durations=0 --deselect "tests/aot_hlo_identical_test.py::AotHloIdenticalTest::test_default_hlo_match" --deselect "tests/tokenizer_test.py::TokenizerTest::test_detokenize" $SPLIT_ARGS
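One reviewer-level caveat on the added `SPLIT_ARGS` one-liner: in shell, `A && B || C` is not a true if/else. If the `pip install` of pytest-split fails (say, a transient network error), the `||` branch still fires and silently clears `SPLIT_ARGS`, so the job would run the full unsplit suite. A more defensive sketch, with hypothetical stand-ins for the workflow inputs:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Hypothetical values standing in for inputs.total_workers / inputs.worker_group.
TOTAL_WORKERS=4
WORKER_GROUP=2

SPLIT_ARGS=""
if [ "$TOTAL_WORKERS" -gt 1 ]; then
  # With set -e, a failed install aborts the job instead of
  # silently falling back to an unsplit run.
  python3 -m pip install --quiet pytest-split
  SPLIT_ARGS="--splits $TOTAL_WORKERS --group $WORKER_GROUP"
fi
python3 -m pytest -v --durations=0 $SPLIT_ARGS
```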

.github/workflows/run_tests_internal.yml

Lines changed: 12 additions & 2 deletions
@@ -50,6 +50,14 @@ on:
       cloud_runner:
         required: false
         type: string
+      worker_group:
+        required: false
+        type: number
+        default: 1
+      total_workers:
+        required: false
+        type: number
+        default: 1
 
 jobs:
   run:
@@ -70,5 +78,7 @@ jobs:
         else
           FINAL_PYTEST_MARKER="${{ inputs.pytest_marker }} and not scheduled_only"
         fi
-        python3 -m pip install -e . --no-dependencies &&
-        LIBTPU_INIT_ARGS='--xla_tpu_scoped_vmem_limit_kib=65536' python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" --durations=0
+        python3 -m pip install -e . --no-dependencies
+        [ "${{ inputs.total_workers }}" -gt 1 ] && python3 -m pip install --quiet pytest-split && SPLIT_ARGS="--splits ${{ inputs.total_workers }} --group ${{ inputs.worker_group }}" || SPLIT_ARGS=""
+        export LIBTPU_INIT_ARGS='--xla_tpu_scoped_vmem_limit_kib=65536'
+        python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" --durations=0 $SPLIT_ARGS
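Note the behavioral nuance in moving `LIBTPU_INIT_ARGS` from a per-command prefix to an `export`: a prefix assignment scopes the variable to that single process, while `export` makes it visible to every later command in the step. A minimal demonstration:

```bash
# Per-command assignment: visible only to this one process.
LIBTPU_INIT_ARGS='--xla_tpu_scoped_vmem_limit_kib=65536' \
  python3 -c 'import os; print(os.environ.get("LIBTPU_INIT_ARGS"))'
echo "${LIBTPU_INIT_ARGS:-unset}"   # prints "unset": the shell itself never had it

# Exported: every subsequent command in the step inherits it.
export LIBTPU_INIT_ARGS='--xla_tpu_scoped_vmem_limit_kib=65536'
python3 -c 'import os; print(os.environ.get("LIBTPU_INIT_ARGS"))'
```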

README.md

Lines changed: 11 additions & 3 deletions
@@ -36,10 +36,14 @@ We recommend installing MaxText inside a Python virtual environment.
 This is the easiest way to get started with the latest stable version.
 
 ```bash
-# 1. Install uv, a fast Python package installer
+# 1. Create virtual environment
+uv venv --python 3.12 --seed maxtext_venv
+source maxtext_venv/bin/activate
+
+# 2. Install uv, a fast Python package installer
 pip install uv
 
-# 2. Install MaxText and its dependencies
+# 3. Install MaxText and its dependencies
 uv pip install maxtext --resolution=lowest
 install_maxtext_github_deps
 ```
@@ -55,7 +59,11 @@ If you plan to contribute to MaxText or need the latest unreleased features, ins
 git clone https://github.com/AI-Hypercomputer/maxtext.git
 cd maxtext
 
-# 2. Install dependencies in editable mode
+# 2. Create virtual environment
+uv venv --python 3.12 --seed maxtext_venv
+source maxtext_venv/bin/activate
+
+# 3. Install dependencies in editable mode
 pip install uv
 # install the tpu package
 uv pip install -e .[tpu] --resolution=lowest
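A quick sanity check after either set of venv steps, assuming the same `maxtext_venv` name used above:

```bash
source maxtext_venv/bin/activate
which python3       # should resolve inside maxtext_venv/bin/
python3 --version   # expect Python 3.12.x, matching --python 3.12
```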

base_requirements/requirements.txt

Lines changed: 4 additions & 3 deletions
@@ -11,8 +11,8 @@ google-cloud-aiplatform
 google-cloud-monitoring
 grain[parquet]
 huggingface_hub
-jax!=0.7.1, !=0.7.2
-jaxlib!=0.7.1, !=0.7.2
+jax
+jaxlib
 jaxtyping
 jsonlines
 ml-collections
@@ -36,7 +36,8 @@ tensorflow-datasets
 tensorflow-text
 tensorflow
 tiktoken
+tokamax
 transformers
 qwix
-google-jetstream @ https://github.com/AI-Hypercomputer/JetStream/archive/daedc21c393f23449fb54ddc4f75fca34348ea9c.zip
+google-jetstream @ https://github.com/AI-Hypercomputer/JetStream/archive/29329e8e73820993f77cfc8efe34eb2a73f5de98.zip
 mlperf-logging @ https://github.com/mlcommons/logging/archive/38ab22670527888c8eb7825a4ece176fcc36a95d.zip
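With the `!=0.7.1, !=0.7.2` exclusions dropped, the resolver is free to pick any compatible jax/jaxlib pair, so the versions actually installed can drift between builds. A quick way to see what was resolved (a sketch, not part of this commit):

```bash
# Show the jax/jaxlib versions the resolver actually chose.
python3 -m pip show jax jaxlib | grep -E '^(Name|Version)'
python3 -c "import jax; print(jax.__version__)"
```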

benchmarks/maxtext_trillium_model_configs.py

Lines changed: 119 additions & 0 deletions
@@ -1714,6 +1714,125 @@
     ),
 )
 
+gemma3_12b_32768_v6e256 = _add_to_model_dictionary(
+    trillium_model_dict,
+    MaxTextModel(
+        model_name="gemma3-12b-32768-v6e256",
+        model_type="gemma3-12b",
+        tuning_params={
+            "per_device_batch_size": 1,
+            "num_vocab_tiling": 16,
+            "ici_fsdp_parallelism": -1,
+            "remat_policy": "custom",
+            "decoder_layer_input": "device",
+            "query_proj": "remat",
+            "key_proj": "remat",
+            "value_proj": "remat",
+            "max_target_length": 32768,
+            "attention": "flash",
+            "gcs_metrics": True,
+            "use_iota_embed": True,
+            "dataset_path": "gs://max-datasets-rogue",
+            "dataset_type": "synthetic",
+            "reuse_example_batch": 1,
+            "enable_checkpointing": False,
+            "profiler": "xplane",
+            "skip_first_n_steps_for_profiler": 10,
+            "profiler_steps": 2,
+            "tokenizer_path": os.path.join("assets", "tokenizer.gemma3"),
+            "sa_block_q": 1024,
+            "sa_block_kv": 1024,
+            "sa_block_kv_compute": 1024,
+            "sa_block_q_dkv": 512,
+            "sa_block_kv_dkv": 2048,
+            "sa_block_kv_dkv_compute": 512,
+            "sa_block_q_dq": 1024,
+            "sa_block_kv_dq": 1024,
+        },
+        xla_flags=(xla_flags_library.CUSTOM_VMEM_LIMIT_FLAG(vmem_limit=122880)),
+    ),
+)
+
+gemma3_12b_32768_2x_v6e256 = _add_to_model_dictionary(
+    trillium_model_dict,
+    MaxTextModel(
+        model_name="gemma3-12b-32768-2x-v6e256",
+        model_type="gemma3-12b",
+        tuning_params={
+            "per_device_batch_size": 1,
+            "num_vocab_tiling": 16,
+            "ici_fsdp_parallelism": 1,
+            "ici_fsdp_transpose_parallelism": -1,
+            "remat_policy": "custom",
+            "decoder_layer_input": "device",
+            "query_proj": "remat",
+            "key_proj": "remat",
+            "value_proj": "remat",
+            "max_target_length": 32768,
+            "attention": "flash",
+            "gcs_metrics": True,
+            "use_iota_embed": True,
+            "dataset_path": "gs://max-datasets-rogue",
+            "dataset_type": "synthetic",
+            "reuse_example_batch": 1,
+            "enable_checkpointing": False,
+            "profiler": "xplane",
+            "skip_first_n_steps_for_profiler": 10,
+            "profiler_steps": 2,
+            "tokenizer_path": os.path.join("assets", "tokenizer.gemma3"),
+            "sa_block_q": 1024,
+            "sa_block_kv": 1024,
+            "sa_block_kv_compute": 1024,
+            "sa_block_q_dkv": 512,
+            "sa_block_kv_dkv": 2048,
+            "sa_block_kv_dkv_compute": 512,
+            "sa_block_q_dq": 1024,
+            "sa_block_kv_dq": 1024,
+        },
+        xla_flags=(xla_flags_library.CUSTOM_VMEM_LIMIT_FLAG(vmem_limit=122880)),
+    ),
+)
+
+gemma3_12b_32768_4x_v6e256 = _add_to_model_dictionary(
+    trillium_model_dict,
+    MaxTextModel(
+        model_name="gemma3-12b-32768-4x-v6e256",
+        model_type="gemma3-12b",
+        tuning_params={
+            "per_device_batch_size": 1,
+            "num_vocab_tiling": 16,
+            "ici_fsdp_parallelism": 1,
+            "ici_fsdp_transpose_parallelism": -1,
+            "remat_policy": "custom",
+            "decoder_layer_input": "device",
+            "query_proj": "remat",
+            "key_proj": "remat",
+            "value_proj": "remat",
+            "max_target_length": 32768,
+            "attention": "flash",
+            "gcs_metrics": True,
+            "use_iota_embed": True,
+            "dataset_path": "gs://max-datasets-rogue",
+            "dataset_type": "synthetic",
+            "reuse_example_batch": 1,
+            "enable_checkpointing": False,
+            "profiler": "xplane",
+            "skip_first_n_steps_for_profiler": 10,
+            "profiler_steps": 2,
+            "tokenizer_path": os.path.join("assets", "tokenizer.gemma3"),
+            "sa_block_q": 1024,
+            "sa_block_kv": 1024,
+            "sa_block_kv_compute": 1024,
+            "sa_block_q_dkv": 512,
+            "sa_block_kv_dkv": 2048,
+            "sa_block_kv_dkv_compute": 512,
+            "sa_block_q_dq": 1024,
+            "sa_block_kv_dq": 1024,
+        },
+        xla_flags=(xla_flags_library.CUSTOM_VMEM_LIMIT_FLAG(vmem_limit=122880)),
+    ),
+)
+
 # Config for Llama3.1 70B model with 131072 max target length aka context length
 llama3_1_70b_131072 = _add_to_model_dictionary(
     trillium_model_dict,
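For scale context on these three configs: assuming the v6e-256 slice implied by the names (256 chips) and the `per_device_batch_size: 1` above, the per-step token count works out as follows (a back-of-the-envelope sketch, not code from this commit):

```bash
# 1 sequence/chip × 256 chips × 32768 tokens/sequence
echo $(( 1 * 256 * 32768 ))   # 8388608 tokens per training step
```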

docker_build_dependency_image.sh

Lines changed: 11 additions & 11 deletions
@@ -27,7 +27,7 @@
 # works with any custom wheels.
 # bash docker_build_dependency_image.sh MODE=custom_wheels
 
-# bash docker_build_dependency_image.sh MODE=grpo
+# bash docker_build_dependency_image.sh MODE=post-training
 
 # Enable "exit immediately if any command fails" option
 set -e
@@ -68,17 +68,17 @@ if [[ -z ${MODE} ]]; then
   export MODE=stable
   echo "Default MODE=${MODE}"
   export CUSTOM_JAX=0
-  export INSTALL_GRPO=0
+  export INSTALL_POST_TRAINING=0
 elif [[ ${MODE} == "custom_wheels" ]] ; then
   export MODE=nightly
   export CUSTOM_JAX=1
-  export INSTALL_GRPO=0
-elif [[ ${MODE} == "grpo" || ${MODE} == "grpo-experimental" ]] ; then
-  export INSTALL_GRPO=1
+  export INSTALL_POST_TRAINING=0
+elif [[ ${MODE} == "post-training" || ${MODE} == "post-training-experimental" ]] ; then
+  export INSTALL_POST_TRAINING=1
   export CUSTOM_JAX=0
 else
   export CUSTOM_JAX=0
-  export INSTALL_GRPO=0
+  export INSTALL_POST_TRAINING=0
 fi
 
 if [[ -z ${DEVICE} ]]; then
@@ -124,8 +124,8 @@ if [[ -z ${LIBTPU_GCS_PATH+x} ]] ; then
 elif [[ ${MANTARAY} == "true" ]]; then
   echo "Building with benchmark-db"
   docker build --network host --build-arg MODE=${MODE} --build-arg JAX_VERSION=$JAX_VERSION --build-arg LIBTPU_GCS_PATH=$LIBTPU_GCS_PATH --build-arg DEVICE=$DEVICE -f ./maxtext_db_dependencies.Dockerfile -t ${LOCAL_IMAGE_NAME} .
-elif [[ ${INSTALL_GRPO} -eq 1 && ${DEVICE} == "tpu" ]]; then
-  echo "Installing MaxText stable mode dependencies for GRPO"
+elif [[ ${INSTALL_POST_TRAINING} -eq 1 && ${DEVICE} == "tpu" ]]; then
+  echo "Installing MaxText stable mode dependencies for Post-Training"
   docker build --network host --build-arg MODE=stable --build-arg JAX_VERSION=$JAX_VERSION --build-arg LIBTPU_GCS_PATH=$LIBTPU_GCS_PATH --build-arg DEVICE=$DEVICE -f ./maxtext_dependencies.Dockerfile -t ${LOCAL_IMAGE_NAME} .
 else
   docker build --network host --build-arg MODE=${MODE} --build-arg JAX_VERSION=$JAX_VERSION --build-arg LIBTPU_GCS_PATH=$LIBTPU_GCS_PATH --build-arg DEVICE=$DEVICE -f ./maxtext_dependencies.Dockerfile -t ${LOCAL_IMAGE_NAME} .
@@ -136,9 +136,9 @@ else
   docker build --network host --build-arg CUSTOM_LIBTPU=true -f ./maxtext_libtpu_path.Dockerfile -t ${LOCAL_IMAGE_NAME} .
 fi
 
-if [[ ${INSTALL_GRPO} -eq 1 ]] ; then
+if [[ ${INSTALL_POST_TRAINING} -eq 1 ]] ; then
   if [[ ${DEVICE} != "tpu" ]] ; then
-    echo "Error: MODE=grpo is only supported for DEVICE=tpu"
+    echo "Error: MODE=post-training is only supported for DEVICE=tpu"
     exit 1
   fi
 
@@ -158,7 +158,7 @@ if [[ ${INSTALL_GRPO} -eq 1 ]] ; then
     --network host \
     --build-arg BASEIMAGE=${LOCAL_IMAGE_NAME} \
     --build-arg MODE=${MODE} \
-    -f ./maxtext_grpo_dependencies.Dockerfile \
+    -f ./maxtext_post_training_dependencies.Dockerfile \
     -t ${LOCAL_IMAGE_NAME} .
 fi
 
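Under the rename, the invocation from the script's header comment becomes the following; per the guard above, the build errors out unless `DEVICE` resolves to `tpu` (the explicit `DEVICE=tpu` here is illustrative, since the script also applies a default):

```bash
# Build the dependency image with post-training extras (formerly MODE=grpo).
bash docker_build_dependency_image.sh MODE=post-training DEVICE=tpu
```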
docs/guides.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,5 @@ guides/pallas_kernels_performance.md
3131
guides/understand_logs_and_metrics.md
3232
guides/xprof_user_guide.md
3333
guides/checkpointing_solutions.md
34+
guides/megascale_hang_playbook.md
3435
```
