Commit b6aaa87

Merge pull request #2705 from AI-Hypercomputer:chengnuojin-xaot

PiperOrigin-RevId: 834357725
2 parents 8de5059 + 0f84a7a commit b6aaa87

File tree: 5 files changed, +164 -14 lines

.github/workflows/run_pathways_tests_internal.yml

Lines changed: 1 addition & 1 deletion

@@ -75,7 +75,7 @@ jobs:
           python3 -m pip install -e . --no-dependencies &&
           python3 -m pip uninstall -y libtpu &&
           # TODO(b/454659463): Enable test_default_hlo_match after volume mount is supported.
-          python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" -k "not AotHloIdenticalTest" --durations=0
+          python3 -m pytest ${{ inputs.pytest_addopts }} -v -m "${FINAL_PYTEST_MARKER}" -k "not AotHloIdenticalTest and not CompileThenLoad" --durations=0

     services:
       resource_manager:
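pytest's -k flag takes a boolean expression matched against collected test IDs, so chaining "and not" clauses deselects several suites in one filter. A minimal illustration (the surrounding test IDs are hypothetical; only the two excluded class names come from this commit):

  # Suppose collection yields:
  #   tests/xaot_test.py::CompileThenLoadTest::test_default_compile_load
  #   tests/aot_test.py::AotHloIdenticalTest::test_default_hlo_match
  #   tests/train_test.py::TrainTest::test_one_step
  # This invocation then runs only the last one:
  python3 -m pytest -v -k "not AotHloIdenticalTest and not CompileThenLoad"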

src/MaxText/maxtext_utils.py

Lines changed: 2 additions & 2 deletions

@@ -164,7 +164,7 @@ def should_prevent_cse_in_remat(config):
   return True


-def load_compiled(config, partial_train, state):
+def load_compiled(config, partial_train, state, execution_devices):
   """# Loading a serialized compiled train step function."""

   # Currently partial_train and state are needed to reconstruct
@@ -187,7 +187,7 @@ def get_train_input_output_trees(func, input_args, input_kwargs):
   shaped_input_args = (state, shaped_batch, example_rng)
   shaped_input_kwargs = {}
   in_tree, out_tree = get_train_input_output_trees(partial_train, shaped_input_args, shaped_input_kwargs)
-  p_train_step = deserialize_and_load(serialized_compiled, in_tree, out_tree)
+  p_train_step = deserialize_and_load(serialized_compiled, in_tree, out_tree, execution_devices=execution_devices)
   return p_train_step

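The new execution_devices argument is threaded through to JAX's AOT deserializer. A minimal sketch of the round trip this function builds on, assuming deserialize_and_load comes from jax.experimental.serialize_executable and that the installed JAX version accepts the execution_devices keyword (only the keyword itself is confirmed by this diff):

  # Serialize a compiled executable, then rebuild it bound to an explicit
  # device list, mirroring the execution_devices plumbing above.
  import jax
  import jax.numpy as jnp
  from jax.experimental.serialize_executable import serialize, deserialize_and_load

  def step(x):
    return jnp.sin(x) * 2.0

  compiled = jax.jit(step).lower(jnp.zeros((8,), jnp.float32)).compile()
  payload, in_tree, out_tree = serialize(compiled)

  # Later, possibly in a different process:
  restored = deserialize_and_load(payload, in_tree, out_tree, execution_devices=jax.devices())
  print(restored(jnp.arange(8, dtype=jnp.float32)))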

src/MaxText/train.py

Lines changed: 29 additions & 8 deletions

@@ -228,7 +228,12 @@ def train_step(model, config, state_mesh_shardings, params_shardings, state, dat
     rng2: A new rng key that can be used in future calls.

   """
-  reference_params, reference_params_sharding, extra_dpo_args, _loss_fn = [], [], [], loss_fn
+  reference_params, reference_params_sharding, extra_dpo_args, _loss_fn = (
+      [],
+      [],
+      [],
+      loss_fn,
+  )
   if config.use_dpo:
     state, reference_params = _split_dpo_state(state)
     state_mesh_shardings, reference_params_sharding = _split_dpo_state(state_mesh_shardings)
@@ -252,15 +257,19 @@ def train_step(model, config, state_mesh_shardings, params_shardings, state, dat
   if config.optimizer_memory_host_offload:
     if config.use_dpo:
       reference_params = jax.device_put(
-          reference_params, max_utils.with_memory_kind(reference_params_sharding, "device")
+          reference_params,
+          max_utils.with_memory_kind(reference_params_sharding, "device"),
       )
       extra_dpo_args = [reference_params]
   if config.shard_optimizer_over_data:
     params = jax.tree.map(jax.lax.with_sharding_constraint, params, params_shardings)
   grad_func = jax.value_and_grad(_loss_fn, argnums=4, has_aux=True)
   (loss, aux), raw_grads = grad_func(model, config, data, dropout_rng, params, *extra_dpo_args, is_train=True)

-  raw_grads = jax.tree_util.tree_map(lambda x: x.astype(config.grad_dtype) if x.dtype == jnp.float32 else x, raw_grads)
+  raw_grads = jax.tree_util.tree_map(
+      lambda x: x.astype(config.grad_dtype) if x.dtype == jnp.float32 else x,
+      raw_grads,
+  )
   intermediate_outputs = aux["intermediate_outputs"]
   total_weights = aux["total_weights"]
   moe_lb_loss = aux["moe_lb_loss"]
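The reflowed cast above is the standard JAX pattern for selectively downcasting gradients: only float32 leaves are converted to config.grad_dtype, everything else passes through. A self-contained sketch with stand-in names:

  import jax
  import jax.numpy as jnp

  grads = {
      "w": jnp.ones((2, 2), jnp.float32),
      "scale": jnp.ones((), jnp.bfloat16),  # already low precision; untouched
  }
  grad_dtype = jnp.bfloat16  # stand-in for config.grad_dtype

  cast_grads = jax.tree_util.tree_map(
      lambda x: x.astype(grad_dtype) if x.dtype == jnp.float32 else x,
      grads,
  )
  print(jax.tree_util.tree_map(lambda x: x.dtype, cast_grads))  # both leaves now bfloat16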
@@ -274,7 +283,10 @@ def train_step(model, config, state_mesh_shardings, params_shardings, state, dat
     state = state.replace(
         opt_state=jax.device_put(
             state.opt_state,
-            jax.tree_util.tree_map(lambda x: x.with_memory_kind(kind="device"), state_mesh_shardings.opt_state),
+            jax.tree_util.tree_map(
+                lambda x: x.with_memory_kind(kind="device"),
+                state_mesh_shardings.opt_state,
+            ),
         )
     )
     # Move all parameters to device before optimizer update
@@ -378,16 +390,25 @@ def train_loop(config, recorder, state=None):
   params_shardings, state_mesh_shardings = sharding.maybe_update_params_sharding_with_opt(config, state_mesh_shardings)

   p_train_step, p_eval_step = train_utils.jit_train_and_eval_step(
-      config, model, mesh, state, state_mesh_shardings, train_step, eval_step, eval_data_iterator, params_shardings
+      config,
+      model,
+      mesh,
+      state,
+      state_mesh_shardings,
+      train_step,
+      eval_step,
+      eval_data_iterator,
+      params_shardings,
   )

   with mesh, nn_partitioning.axis_rules(config.logical_axis_rules):
     shaped_batch = maxtext_utils.get_shaped_batch(config)
     if config.shard_optimizer_over_data:
       state = sharding.maybe_shard_with_name(state, state_mesh_shardings, config.shard_mode)
-    compiled = p_train_step.lower(state, shaped_batch, init_rng).compile()
-    compiled_stats = compiled.memory_analysis()
-    max_utils.print_compiled_memory_stats(compiled_stats)
+    if config.compiled_trainstep_file == "":  # compile only when there is no pre-compiled file loaded
+      compiled = p_train_step.lower(state, shaped_batch, init_rng).compile()
+      compiled_stats = compiled.memory_analysis()
+      max_utils.print_compiled_memory_stats(compiled_stats)

   start_step = get_first_step(state)  # this is the start_step for training
   prof = profiler.Profiler(config, offset_step=start_step)
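The new guard skips ahead-of-time lowering when a pre-compiled train step is being loaded from compiled_trainstep_file, avoiding a redundant compile. The guarded chain is the standard JAX AOT API; a minimal sketch with a toy function:

  import jax
  import jax.numpy as jnp

  def toy_step(x):
    return (x * 2.0).sum()

  # Lower for concrete shapes, compile, and inspect memory statistics.
  lowered = jax.jit(toy_step).lower(jnp.zeros((4, 4), jnp.float32))
  compiled = lowered.compile()
  stats = compiled.memory_analysis()  # backend-specific; may be None
  if stats is not None:
    print(stats)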

src/MaxText/train_utils.py

Lines changed: 4 additions & 3 deletions

@@ -90,10 +90,11 @@ def jit_train_step(config, model, state, state_mesh_shardings, data_sharding, tr

   # Define the compilation of functional_train, either by loading the compiled version or wrapping a new one in a jit
   if config.compiled_trainstep_file != "":
-    print("Loading the compiled function...", flush=True)
+    max_logging.log("Loading the compiled function...")
+    execution_devices = model.mesh.devices.flatten().tolist()
     # Need to pass train signature and state to determine i/o shapes of train_state for now.
-    p_train_step = maxtext_utils.load_compiled(config, functional_train, state)
-    print("Loaded compiled function!", flush=True)
+    p_train_step = maxtext_utils.load_compiled(config, functional_train, state, execution_devices)
+    max_logging.log("Loaded compiled function!")
   else:
     p_train_step = jax.jit(
         functional_train,
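execution_devices is obtained by flattening the mesh's device grid into a plain Python list. A minimal sketch of that derivation, assuming a simple two-axis mesh over the locally visible devices (axis names are illustrative, not MaxText's):

  import numpy as np
  import jax
  from jax.sharding import Mesh

  # Mesh stores devices as a NumPy ndarray, so flatten().tolist() yields the
  # ordered device list that deserialize_and_load expects.
  devices = np.array(jax.devices()).reshape(-1, 1)
  mesh = Mesh(devices, axis_names=("data", "model"))
  execution_devices = mesh.devices.flatten().tolist()
  print(execution_devices)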

tests/xaot_test.py

Lines changed: 128 additions & 0 deletions

@@ -0,0 +1,128 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+These tests verify the Compile-Then-Load workflow.
+They ensure that a model compiled via train_compile.py can be successfully
+loaded and executed by train.py.
+"""
+
+import tempfile
+import unittest
+import pytest
+import os
+import shutil
+import jax
+from MaxText.globals import MAXTEXT_PKG_DIR
+from MaxText import train_compile
+from MaxText import train
+
+
+class CompileThenLoadTest(unittest.TestCase):
+  """Tests for the split compile-and-train workflow."""
+
+  def setUp(self):
+    """Create a temporary directory for the compiled pickle file."""
+    self.temp_dir = tempfile.mkdtemp()
+    self.pickle_file = os.path.join(self.temp_dir, "test_compiled_train.pickle")
+
+    # Ensure the JAX compilation cache doesn't interfere with clean test runs.
+    jax.config.update("jax_enable_compilation_cache", False)
+
+  def tearDown(self):
+    """Clean up the temporary directory."""
+    if os.path.exists(self.temp_dir):
+      shutil.rmtree(self.temp_dir)
+
+  def get_device_user_facing_name(self):
+    """Gets the TPU device's user-facing name to generate correct AOT arguments."""
+    devices = jax.devices()
+    if not devices or "tpu" not in devices[0].platform.lower():
+      pytest.skip("This test requires a TPU environment.")
+
+    num_devices = len(devices)
+    device_kind = devices[0].device_kind
+    device_info = {
+        "TPU v4": ("v4", 2 * num_devices),
+        "TPU v5 lite": ("v5e", num_devices),
+        "TPU v5": ("v5p", 2 * num_devices),
+        "TPU v6": ("v6e", num_devices),
+    }
+
+    prefix, topology_devices = next((v for k, v in device_info.items() if k in device_kind), (None, None))
+    if prefix is None:
+      raise ValueError(f"Unsupported TPU device kind for AOT test: {device_kind}")
+
+    return f"{prefix}-{topology_devices}"
+
+  def run_compile_then_load(self, test_name, *extra_args):
+    """
+    Executes the compile step, checks that the pickle file was written,
+    then executes the load/train step.
+    """
+
+    # Arguments shared by the compile and train invocations.
+    shared_args = [
+        "global_parameter_scale=1",
+        "per_device_batch_size=4",
+        "steps=1",
+        "learning_rate=1e-3",
+        "dataset_type=synthetic",
+        "enable_checkpointing=False",
+    ]
+
+    if extra_args:
+      shared_args.extend(extra_args)
+
+    # Compilation
+    topology = self.get_device_user_facing_name()
+
+    compile_specific_args = [
+        f"compile_topology={topology}",
+        "compile_topology_num_slices=1",
+        f"compiled_trainstep_file={self.pickle_file}",
+    ]
+
+    compile_argv = (
+        (None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")) + tuple(shared_args) + tuple(compile_specific_args)
+    )
+
+    print(f"\n--- Starting compilation step for {test_name} ---")
+    # Clear caches before compiling to ensure a clean state.
+    jax.clear_caches()
+    train_compile.main(compile_argv)
+
+    # Assert the pickle file was actually created.
+    assert os.path.exists(self.pickle_file), f"Compilation failed: {self.pickle_file} was not created."
+
+    load_specific_args = [
+        "base_output_directory=gs://runner-maxtext-logs",
+        f"run_name=compile_then_load_{test_name}",
+        f"compiled_trainstep_file={self.pickle_file}",
+    ]
+
+    train_argv = (
+        (None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")) + tuple(shared_args) + tuple(load_specific_args)
+    )
+
+    print(f"\n--- Starting load/train step for {test_name} ---")
+    # Clear caches before training to ensure we actually load from the pickle.
+    jax.clear_caches()
+    train.main(train_argv)
+
+    print(f"Successfully compiled and loaded for test {test_name}!")
+
+  @pytest.mark.tpu_only
+  def test_default_compile_load(self):
+    self.run_compile_then_load("default_run")
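Assuming the repository's usual pytest conventions, the new test can be run in isolation on a TPU host with something like:

  python3 -m pytest tests/xaot_test.py -v -m tpu_only -k CompileThenLoad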
