 
 import jax
 import jax.numpy as jnp
+from jax.sharding import NamedSharding
+
+from MaxText.common_types import ShardMode
+from MaxText.maxtext_utils import maybe_shard_with_name
 
 
 def gradient_accumulation_loss_and_grad(
@@ -58,6 +62,17 @@ def gradient_accumulation_loss_and_grad(
     - final_aux (PyTree): Auxiliary outputs, summed across microbatches.
     - raw_grads (PyTree): The accumulated and averaged gradients.
   """
+
+  def _maybe_shard_with_name(inputs, sharding_names):
+    """Wrapper around maybe_shard_with_name with shard_mode fixed to config.shard_mode."""
+    return maybe_shard_with_name(inputs, sharding_names, config.shard_mode)
+
+  # For more efficient DP/ZeRO-1 + GA: accumulate gradients unreduced on the data axis and reduce once after the loop
+  if config.shard_mode == ShardMode.EXPLICIT and config.ici_data_parallelism > 1:
+    ga_params_shardings = jax.tree.map(update_sharding_for_reduced, params_shardings)
+    grad_shardings = jax.tree.map(update_sharding_for_unreduced, params_shardings)
+  else:
+    ga_params_shardings = grad_shardings = params_shardings
   # When using Zero-1 optimizer sharding, cast params to lower precision and apply sharding constraints
   # so that all-gather is done once in the lower precision before the gradient accumulation loop
   if config.shard_optimizer_over_data:
@@ -68,15 +83,14 @@ def convert_to_bf16(param):
       return param
 
     ga_params = jax.tree_util.tree_map(convert_to_bf16, params)
-    ga_params = jax.tree.map(jax.lax.with_sharding_constraint, ga_params, params_shardings)
   else:
     ga_params = params
 
+  ga_params = jax.tree.map(_maybe_shard_with_name, ga_params, ga_params_shardings)
   grad_func = jax.value_and_grad(_loss_fn, argnums=4, has_aux=True)
 
   def accumulate_gradient(acc_grad_and_loss, data):
     ga_params = acc_grad_and_loss["ga_params"]
-
     (_, aux), cur_batch_gradient = grad_func(model, config, data, dropout_rng, ga_params, *extra_dpo_args, is_train=True)
     acc_grad_and_loss["loss"] += aux["total_loss"]
     acc_grad_and_loss["moe_lb_loss"] += aux["moe_lb_loss"]
@@ -94,7 +108,7 @@ def reshape_to_microbatch_accumulations(batch_arr):
 
   data = jax.tree_util.tree_map(reshape_to_microbatch_accumulations, data)
   init_grad = jax.tree_util.tree_map(jnp.zeros_like, ga_params)
-  init_grad = jax.tree.map(jax.lax.with_sharding_constraint, init_grad, params_shardings)
+  init_grad = jax.tree.map(_maybe_shard_with_name, init_grad, grad_shardings)
   init_grad_and_loss = {
       "loss": 0.0,
       "grad": init_grad,
@@ -113,9 +127,23 @@ def reshape_to_microbatch_accumulations(batch_arr):
       + grad_and_loss["mtp_loss"] / config.gradient_accumulation_steps
   )
   raw_grads = grad_and_loss["grad"]
-  if config.shard_optimizer_over_data:
-    raw_grads = jax.tree.map(jax.lax.with_sharding_constraint, raw_grads, params_shardings)
+  raw_grads = jax.tree.map(_maybe_shard_with_name, raw_grads, params_shardings)
   raw_grads = jax.tree_util.tree_map(lambda arr: arr / grad_and_loss["total_weights"], raw_grads)
   aux = jax.tree.map(lambda x: jnp.sum(x, axis=0), aux)  # pytype: disable=module-attr
 
   return loss, aux, raw_grads
+
+
+# GA helper functions
+def update_sharding_for_reduced(sharding: NamedSharding) -> NamedSharding:
+  """
+  Mark the "data" axis of the given NamedSharding as reduced.
+  """
+  return sharding.update(spec=sharding.spec.update(reduced={"data"}))
+
+
+def update_sharding_for_unreduced(sharding: NamedSharding) -> NamedSharding:
+  """
+  Mark the "data" axis of the given NamedSharding as unreduced.
+  """
+  return sharding.update(spec=sharding.spec.update(unreduced={"data"}))
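
A minimal sketch (not taken from the MaxText sources) of how the two helpers compose with the explicit-sharding branch above. It assumes a single "data" mesh axis with explicit axis types, a replicated parameter sharding, and a JAX version that supports the reduced/unreduced PartitionSpec fields used in the diff; the mesh construction and variable names are illustrative only.

import jax
import numpy as np
from jax.sharding import AxisType, Mesh, NamedSharding, PartitionSpec

# 1-D mesh over all local devices with an explicit "data" axis, mirroring the
# ShardMode.EXPLICIT + ici_data_parallelism > 1 branch above.
mesh = Mesh(np.array(jax.devices()), ("data",), axis_types=(AxisType.Explicit,))

# A parameter replicated over "data" (the DP/ZeRO-1 case).
param_sharding = NamedSharding(mesh, PartitionSpec())

# update_sharding_for_unreduced / update_sharding_for_reduced are the helpers
# defined at the end of the diff above.
# Accumulated gradients hold per-data-shard partial sums, so they are tagged
# unreduced on "data"; the cross-data reduction happens once, when raw_grads is
# constrained back to params_shardings after the accumulation loop.
grad_sharding = update_sharding_for_unreduced(param_sharding)
# Parameters consumed inside the loop get the counterpart reduced tag.
ga_param_sharding = update_sharding_for_reduced(param_sharding)

print(grad_sharding.spec, ga_param_sharding.spec)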