
Commit cfc06af

add rampup batch checkpoint support
1 parent 77d3bbd

File tree

10 files changed: +215 -68 lines changed

src/MaxText/configs/types.py

Lines changed: 19 additions & 6 deletions

@@ -1378,6 +1378,11 @@ class DerivedValues(BaseModel):
      description="The total size of context parallelism, derived from ICI and DCN values.",
  )

+  num_target_devices: None | int = Field(
+      None,
+      description="The number of devices computed from topology in train_compile or jax.devices() in train",
+  )
+
  global_batch_size_to_train_on: None | int = Field(
      None,
      description="The total batch size for training across all devices. Derived from `per_device_batch_size` and data"
@@ -1640,9 +1645,9 @@ def get_num_target_devices():
      else:
        return len(jax.devices())

-    num_devices = 1  # Default for validation when JAX is not initialized
+    self.num_target_devices = 1  # Default for validation when JAX is not initialized
    try:
-      num_devices = get_num_target_devices()
+      self.num_target_devices = get_num_target_devices()
    except (RuntimeError, IndexError):
      logger.warning("JAX device system not available for config validation. Assuming 1 device.")

@@ -1679,15 +1684,20 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
        self.global_batch_size_to_train_on,
        self.micro_batch_size_to_train_on,
    ) = calculate_global_batch_sizes(
-        self.per_device_batch_size, self.expansion_factor_real_data, num_devices, self.gradient_accumulation_steps
+        self.per_device_batch_size,
+        self.expansion_factor_real_data,
+        self.num_target_devices,
+        self.gradient_accumulation_steps,
    )

    # Calculate final evaluation batch sizes.
    (
        self.global_batch_size_to_load_eval,
        self.global_batch_size_to_eval_on,
        self.micro_batch_size_to_eval_on,
-    ) = calculate_global_batch_sizes(self.eval_per_device_batch_size, self.expansion_factor_real_data, num_devices, 1)
+    ) = calculate_global_batch_sizes(
+        self.eval_per_device_batch_size, self.expansion_factor_real_data, self.num_target_devices, 1
+    )

    # Calculate ramp-up batch size parameters if enabled.
    if self.enable_rampup_batch_size:
@@ -1696,7 +1706,10 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
          _,
          _,
      ) = calculate_global_batch_sizes(
-          self.per_device_batch_size_start, self.expansion_factor_real_data, num_devices, self.gradient_accumulation_steps
+          self.per_device_batch_size_start,
+          self.expansion_factor_real_data,
+          self.num_target_devices,
+          self.gradient_accumulation_steps,
      )
      (
          self.global_batch_size_to_load_increment,
@@ -1705,7 +1718,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
      ) = calculate_global_batch_sizes(
          self.per_device_batch_size_increment,
          self.expansion_factor_real_data,
-          num_devices,
+          self.num_target_devices,
          self.gradient_accumulation_steps,
      )
      diff_batch_size = self.global_batch_size_to_load - self.global_batch_size_to_load_start
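
The calculate_global_batch_sizes helper itself is not shown in this diff. As a rough, hypothetical sketch of the arithmetic that the derived num_target_devices now feeds (simplified: it ignores fractional per-device batch sizes and the expansion_factor_real_data == -1 special case):

# Hypothetical sketch only; the real helper lives in configs/types.py and handles more cases.
def sketch_global_batch_sizes(per_device_batch_size, expansion_factor, num_devices, grad_accum_steps):
  # Batch trained on per micro-step, summed across all devices.
  micro_batch_size_to_train_on = int(num_devices * per_device_batch_size)
  # Batch loaded from the input pipeline; expansion_factor > 1 loads extra real data.
  micro_batch_size_to_load = int(micro_batch_size_to_train_on * expansion_factor)
  # Gradient accumulation scales both into per-optimizer-step (global) sizes.
  global_batch_size_to_load = micro_batch_size_to_load * grad_accum_steps
  global_batch_size_to_train_on = micro_batch_size_to_train_on * grad_accum_steps
  return global_batch_size_to_load, global_batch_size_to_train_on, micro_batch_size_to_train_on

# Example: 4 target devices, per-device batch 8, no expansion, no accumulation.
print(sketch_global_batch_sizes(8, 1, 4, 1))  # (32, 32, 32)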

src/MaxText/data_loader.py

Lines changed: 31 additions & 45 deletions

@@ -20,7 +20,7 @@
from jax.experimental import checkify

from MaxText import exceptions
-from MaxText import max_logging
+from MaxText.sharding import get_input_data_sharding, maybe_shard_with_name
from MaxText.utils.goodput_utils import (
    GoodputEvent,
    maybe_record_goodput,
@@ -42,15 +42,16 @@ def __init__(self, config, mesh, data_iterator, goodput_recorder):
    else:
      self.data_iterator = data_iterator
    self.last_batch = None
+    self.input_data_shardings = get_input_data_sharding(config, mesh)

  def update_data_iterator(self):
    """Update to the next data iterator in the list, if applicable."""
    if hasattr(self, "data_iterator_list"):
      self.data_iterator_index = (self.data_iterator_index + 1) % len(self.data_iterator_list)
      self.data_iterator = self.data_iterator_list[self.data_iterator_index]

-  def load_next_batch(self):
-    """Loads the next batch. Can keep reusing the same batch for performance reasons."""
+  def load_next_batch_pre_sharding(self):
+    """Loads the next batch without sharding. Can keep reusing the same batch for performance reasons."""
    with maybe_record_goodput(self.goodput_recorder, GoodputEvent.DATA_LOADING):
      try:
        if self.config.reuse_example_batch and self.last_batch:
@@ -67,6 +68,14 @@ def load_next_batch(self):
        raise exceptions.StopTraining(f"`load_next_batch()` failed with {type(e)} exception: ({e}).")
    return self.last_batch

+  def load_next_batch(self, *args, **kwargs):
+    """Loads the next batch with a sharding hint."""
+    return maybe_shard_with_name(
+        self.load_next_batch_pre_sharding(),
+        self.input_data_shardings,
+        self.config.shard_mode,
+    )
+
  def check_example_batch(self):
    if self.config.max_checkify:
      jittable_f = checkify.checkify(lambda x: checkify.check(jnp.any(x > -1), "Batch contains bad synthetic data!"))
@@ -90,22 +99,11 @@ def __init__(self, config, mesh, data_iterator, goodput_recorder):
    # Call parent constructor
    super().__init__(config, mesh, data_iterator, goodput_recorder)

-    # Get ramp-up parameters from config, with safe defaults
-    self.global_batch_size_end = config.global_batch_size_to_load
-    self.global_batch_size_start = config.global_batch_size_to_load_start
-    self.increment = config.global_batch_size_to_load_increment
-    self.samples_per_increment = config.rampup_samples_per_increment_to_load
-
-    # Check if ramp-up is active
-    self.rampup_active = self.global_batch_size_start < self.global_batch_size_end
-
-    # State for tracking ramp-up
-    self.accum_samples = 0
-    self.global_batch_size_current = self.global_batch_size_start
+    self.rampup_active = True
    self.batch_buffer = None
    self.buffer_start = 0

-  def load_next_batch(self):
+  def load_next_batch(self, *args, rampup_manager=None, **kwargs):
    """
    Updates the batch size based on the schedule and then loads the next
    batch using the parent method.
@@ -114,68 +112,56 @@ def load_next_batch(self):
    if not self.rampup_active:
      return super().load_next_batch()

-    # If in rampup phase, we use batch buffer to save data
-    # Check if it's time to increment the batch size
-    is_time_to_increment = self.accum_samples >= self.samples_per_increment
-
-    if is_time_to_increment:
-      # Update current batch size and refresh accumulate samples
-      max_logging.log(
-          f"Global batch size increments from {self.global_batch_size_current}"
-          f" to {self.global_batch_size_current + self.increment}"
-      )
-      self.global_batch_size_current += self.increment
-      self.accum_samples = 0
-      self.rampup_active = self.global_batch_size_current < self.global_batch_size_end
-
-    self.accum_samples += self.global_batch_size_current
-    slice_start, slice_end = self.buffer_start, self.buffer_start + self.global_batch_size_current
+    slice_start, slice_end = self.buffer_start, self.buffer_start + rampup_manager.global_batch_size_current

-    # Load new batch if batch_buffer is None or slice overpast the buffer end
+    # Load a new batch if batch_buffer is None
    if self.batch_buffer is None:
-      self.batch_buffer = super().load_next_batch()
-      slice_start, slice_end = 0, self.global_batch_size_current
+      self.batch_buffer = super().load_next_batch_pre_sharding()
+      slice_start, slice_end = 0, rampup_manager.global_batch_size_current

-    if slice_end > self.global_batch_size_end:
-      old_buffer, self.batch_buffer = self.batch_buffer, super().load_next_batch()
+    # If the slice end runs past the buffer end, collect new batch data
+    if slice_end > rampup_manager.global_batch_size_end:
+      old_buffer, self.batch_buffer = self.batch_buffer, super().load_next_batch_pre_sharding()

      # self.global_batch_size_end is batch_buffer size
      def _slice_and_concat(old_data, new_data):
        sliced_old_data = jax.lax.dynamic_slice_in_dim(
            old_data,
            slice_start,
-            self.global_batch_size_end - slice_start,
+            rampup_manager.global_batch_size_end - slice_start,
            axis=0,
        )
        sliced_new_data = jax.lax.dynamic_slice_in_dim(
            new_data,
            0,
-            slice_end - self.global_batch_size_end,
+            slice_end - rampup_manager.global_batch_size_end,
            axis=0,
        )
        return jax.lax.concatenate((sliced_old_data, sliced_new_data), dimension=0)

-      self.buffer_start = slice_end - self.global_batch_size_end
-      return jax.tree.map(_slice_and_concat, old_buffer, self.batch_buffer)
+      self.buffer_start = slice_end - rampup_manager.global_batch_size_end
+      output = jax.tree.map(_slice_and_concat, old_buffer, self.batch_buffer)
    else:

      def _slice(data):
        return jax.lax.dynamic_slice_in_dim(
            data,
            slice_start,
-            self.global_batch_size_current,
+            rampup_manager.global_batch_size_current,
            axis=0,
        )

      self.buffer_start = slice_end
-      return jax.tree.map(_slice, self.batch_buffer)
+      output = jax.tree.map(_slice, self.batch_buffer)
+    self.rampup_active = rampup_manager.update()
+    return maybe_shard_with_name(output, self.input_data_shardings, self.config.shard_mode)


-def create_dataloader(config, mesh, data_iterator, goodput_recorder):
+def create_dataloader(config, mesh, data_iterator, goodput_recorder, rampup_manager):
  """
  Create the dataloader
  """
-  if config.enable_rampup_batch_size:
+  if rampup_manager and rampup_manager.num_accum_samples < config.global_rampup_samples:
    return RampUpDataLoader(config, mesh, data_iterator, goodput_recorder)
  else:
    return DataLoader(config, mesh, data_iterator, goodput_recorder)
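
For intuition, here is a small self-contained illustration of the slice-and-concatenate path in RampUpDataLoader.load_next_batch, with toy arrays standing in for real example batches (the variable names mirror the code above):

import jax
import jax.numpy as jnp

global_batch_size_end = 8      # buffer size, i.e. the full batch produced by the iterator
global_batch_size_current = 6  # current ramped-up batch size
buffer_start = 4               # leftover offset carried over from the previous call

old_buffer = jnp.arange(8)      # previously loaded batch; positions 0..3 already consumed
new_buffer = jnp.arange(8, 16)  # freshly loaded batch

slice_start, slice_end = buffer_start, buffer_start + global_batch_size_current  # (4, 10)

# slice_end > global_batch_size_end, so stitch the tail of the old buffer
# to the head of the new one, exactly like _slice_and_concat above.
tail = jax.lax.dynamic_slice_in_dim(old_buffer, slice_start, global_batch_size_end - slice_start, axis=0)
head = jax.lax.dynamic_slice_in_dim(new_buffer, 0, slice_end - global_batch_size_end, axis=0)
batch = jax.lax.concatenate((tail, head), dimension=0)

print(batch)  # [4 5 6 7 8 9] -> 6 examples spanning both buffers
# The next call starts reading the new buffer at buffer_start = slice_end - global_batch_size_end = 2.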

src/MaxText/elastic_train.py

Lines changed: 4 additions & 0 deletions

@@ -124,6 +124,8 @@ def elastic_handler(
      learning_rate_schedule,
      data_iterator,
      _,
+      _,
+      _,
      state,
  ) = setup_train_loop(config, recorder, elastic_manager.good_devices)

@@ -178,6 +180,8 @@ def train_loop(config, elastic_manager, recorder, state=None):
      learning_rate_schedule,
      data_iterator,
      _,
+      _,
+      _,
      state,
  ) = setup_train_loop(config, recorder)

src/MaxText/rampup_batch.py

Lines changed: 112 additions & 0 deletions

@@ -0,0 +1,112 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pytype: disable=unsupported-operands
+"""Module for batch size managing classes."""
+
+import math
+
+
+class RampupBatchManager:
+  """
+  A stateful class that tracks the current global batch size for a given train step.
+  """
+
+  def __init__(self, config, step_num):
+    self._verify_inputs(config)
+    self._init_values(config)
+    self.num_accum_samples = 0
+
+    # Compute the number of samples already used, given the recovered step number
+    self._recover_states(step_num)
+
+  def _verify_inputs(self, config):
+    """Verify the rampup batch related inputs."""
+    diff_batch_size = config.per_device_batch_size - config.per_device_batch_size_start
+    if diff_batch_size <= 0:
+      raise ValueError(
+          "per_device_batch_size must be greater than per_device_batch_size_start. "
+          f"Got batch size {config.per_device_batch_size} and "
+          f"batch size start {config.per_device_batch_size_start}."
+      )
+    if diff_batch_size % config.per_device_batch_size_increment:
+      raise ValueError(
+          "Expected the rampup batch size change to be divisible by the batch size increment. "
+          f"Got per_device_batch_size={config.per_device_batch_size} and "
+          f"per_device_batch_size_start={config.per_device_batch_size_start}."
+      )
+
+  def _init_values(self, config):
+    """Initialize rampup batch related parameters."""
+    diff_batch_size = config.per_device_batch_size - config.per_device_batch_size_start
+    num_increments = diff_batch_size // config.per_device_batch_size_increment
+    self.samples_per_increment = config.global_rampup_samples / num_increments
+    num_devices = int(config.num_target_devices)
+    self.global_batch_size_end = int(num_devices * config.per_device_batch_size)
+    self.global_batch_size_start = int(num_devices * config.per_device_batch_size_start)
+    self.increment = int(num_devices * config.per_device_batch_size_increment)
+    self.global_rampup_samples = config.global_rampup_samples
+    self.global_batch_size_current = self.global_batch_size_start
+    self.total_rampup_steps = self._compute_total_rampup_steps(config)
+    self.total_used_samples = 0
+
+  def _compute_total_rampup_steps(self, config):
+    """Compute the total number of rampup steps."""
+    batch_size_start = config.per_device_batch_size_start
+    batch_size_end = config.per_device_batch_size
+    batch_size_increment = config.per_device_batch_size_increment
+    diff_batch_size = batch_size_end - batch_size_start
+    num_increments = diff_batch_size // batch_size_increment
+    rampup_samples = config.global_rampup_samples / config.num_target_devices
+    rampup_samples_per_increment = rampup_samples / num_increments
+    total_rampup_steps = 0
+    current_batch_size = batch_size_start
+
+    while current_batch_size < batch_size_end:
+      steps_for_this_stage = math.ceil(rampup_samples_per_increment / current_batch_size)
+      total_rampup_steps += steps_for_this_stage
+      current_batch_size += batch_size_increment
+    return total_rampup_steps
+
+  def _recover_states(self, step_num):
+    """Recover the number of samples already used."""
+    if step_num < 0:
+      return
+    for _ in range(step_num):
+      _ = self.update()
+    return
+
+  def update(self):
+    """Update values when load_next_batch is called."""
+    self.total_used_samples += self.global_batch_size_current
+    self.num_accum_samples += self.global_batch_size_current
+    # Check if it's time to increment the batch size
+    is_time_to_increment = self.num_accum_samples >= self.samples_per_increment
+    if is_time_to_increment:
+      self.global_batch_size_current = min(self.increment + self.global_batch_size_current, self.global_batch_size_end)
+      self.num_accum_samples = 0
+    # Return whether the rampup phase is still active
+    return self.global_batch_size_current < self.global_batch_size_end
+
+
+def create_rampup_manager(config, checkpoint_manager):
+  if not config.enable_rampup_batch_size:
+    return None
+
+  # The current step defaults to -1 if no checkpoint exists
+  current_step = -1
+  if checkpoint_manager and checkpoint_manager.latest_step():
+    current_step = checkpoint_manager.latest_step()
+
+  return RampupBatchManager(config, current_step)
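
A minimal sketch of driving RampupBatchManager directly, with a SimpleNamespace standing in for the real config object (the field values below are made up for illustration):

from types import SimpleNamespace

from MaxText.rampup_batch import RampupBatchManager

config = SimpleNamespace(
    per_device_batch_size_start=2,
    per_device_batch_size_increment=2,
    per_device_batch_size=8,
    global_rampup_samples=96,  # total samples spent ramping up
    num_target_devices=4,
)

# Fresh run: step_num=-1 means there is no checkpoint, so no state to recover.
manager = RampupBatchManager(config, step_num=-1)
print(manager.global_batch_size_start, manager.global_batch_size_end)  # 8 32
print(manager.samples_per_increment)                                   # 32.0

schedule = []
rampup_active = True
while rampup_active:
  schedule.append(manager.global_batch_size_current)
  rampup_active = manager.update()  # mirrors the call in RampUpDataLoader.load_next_batch
print(schedule)  # [8, 8, 8, 8, 16, 16, 24, 24], after which the batch stays at 32

# Restart: replaying step_num updates puts the schedule back at the same point,
# which is what create_rampup_manager(config, checkpoint_manager) does via latest_step().
restored = RampupBatchManager(config, step_num=4)
print(restored.global_batch_size_current)  # 16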

src/MaxText/sft_trainer.py

Lines changed: 2 additions & 0 deletions

@@ -65,6 +65,8 @@ def train_loop(config, recorder, state=None):
      mesh,
      learning_rate_schedule,
      data_iterator,
+      _,
+      _,
      eval_data_iterator,
      state,
  ) = setup_train_loop(config, recorder)

src/MaxText/train.py

Lines changed: 3 additions & 3 deletions

@@ -53,7 +53,6 @@
from MaxText import sharding
from MaxText.layers.multi_token_prediction import calculate_mtp_acceptance_rate, calculate_mtp_loss
from MaxText.common_types import ShardMode
-from MaxText.data_loader import create_dataloader
from MaxText.globals import EPS
from MaxText.metric_logger import MetricLogger
from MaxText.utils import gcs_utils
@@ -377,6 +376,8 @@ def train_loop(config, recorder, state=None):
      mesh,
      learning_rate_schedule,
      data_iterator,
+      data_loader,
+      rampup_manager,
      eval_data_iterator,
      state,
  ) = train_utils.setup_train_loop(config, recorder)
@@ -412,7 +413,6 @@ def train_loop(config, recorder, state=None):

  start_step = get_first_step(state)  # this is the start_step for training
  prof = profiler.Profiler(config, offset_step=start_step)
-  data_loader = create_dataloader(config, mesh, data_iterator, recorder)
  metric_logger = MetricLogger(config=config, learning_rate_schedule=learning_rate_schedule)

  # Write train config params, num model params, and XLA flags to tensorboard
@@ -424,7 +424,7 @@ def train_loop(config, recorder, state=None):
      prof.maybe_activate_profiler(step, state)

      with jax.profiler.StepTraceAnnotation("train", step_num=step):
-        example_batch = data_loader.load_next_batch()
+        example_batch = data_loader.load_next_batch(rampup_manager=rampup_manager)
        # Reshard data from loaded sharding to performant activation sharding
        example_batch = sharding.maybe_shard_with_name(
            example_batch,
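
setup_train_loop itself lives in train_utils and is not part of this section, so the following is only a hypothetical sketch of how the pieces at these call sites fit together: the ramp-up manager is rebuilt from checkpoint state, handed to create_dataloader, and both are returned so train_loop can pass rampup_manager into load_next_batch.

# Hypothetical wiring sketch; the function name and argument list are illustrative,
# not the actual train_utils.setup_train_loop signature.
from MaxText.data_loader import create_dataloader
from MaxText.rampup_batch import create_rampup_manager


def build_dataloader_sketch(config, mesh, data_iterator, checkpoint_manager, goodput_recorder):
  # None unless config.enable_rampup_batch_size is set.
  rampup_manager = create_rampup_manager(config, checkpoint_manager)
  # RampUpDataLoader while ramp-up samples remain, plain DataLoader otherwise.
  data_loader = create_dataloader(config, mesh, data_iterator, goodput_recorder, rampup_manager)
  return data_loader, rampup_manager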
