Commit 82f327b

committed
all 9 flags
1 parent 4d2419e commit 82f327b

File tree: 3 files changed (+161 additions, -55 deletions)


src/MaxText/configs/base.yml

Lines changed: 9 additions & 3 deletions
@@ -178,9 +178,15 @@ use_random_routing: False # whether to use random routing for debug/test purpose
 use_custom_sort_vjp: True # whether to use a custom sort vjp for sparse matmul ops
 use_ring_of_experts: False # whether to use ring of experts for sparse matmul expert parallelism
 # Tunable tiling dimensions used for Megablox
-tile_batch_seq: 512
-tile_embed_dim: 1024
-tile_mlp_dim: 1024
+tile_fwd_batch_seq: 512
+tile_fwd_embed_dim: 1024
+tile_fwd_mlp_dim: 1024
+tile_dlhs_batch_seq: 512
+tile_dlhs_embed_dim: 1024
+tile_dlhs_mlp_dim: 1024
+tile_drhs_batch_seq: 512
+tile_drhs_embed_dim: 1024
+tile_drhs_mlp_dim: 1024
 norm_topk_prob: False # Boolean to enable the top-k probability normalization. Qwen3-specific normalization of router weights.

 # How the expert axis is used to shard attention weights and activations
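The nine flags are consumed as three tile triples downstream. A minimal illustration (the dict-style access and the values here are placeholders; the split mirrors tiling[:3], tiling[3:6], and tiling[-3:] in ops.py below):

# Illustration only: how the nine flags pack into the tiling tuple that
# ops.py later splits into forward, dlhs-backward, and drhs-backward tiles.
config = {
    "tile_fwd_batch_seq": 512, "tile_fwd_embed_dim": 1024, "tile_fwd_mlp_dim": 1024,
    "tile_dlhs_batch_seq": 512, "tile_dlhs_embed_dim": 1024, "tile_dlhs_mlp_dim": 1024,
    "tile_drhs_batch_seq": 512, "tile_drhs_embed_dim": 1024, "tile_drhs_mlp_dim": 1024,
}
tiling = tuple(
    config[f"tile_{phase}_{dim}"]
    for phase in ("fwd", "dlhs", "drhs")
    for dim in ("batch_seq", "embed_dim", "mlp_dim")
)
fwd_tiles = tiling[:3]    # forward gmm
dlhs_tiles = tiling[3:6]  # backward gmm for the activation gradient (dlhs)
drhs_tiles = tiling[-3:]  # backward tgmm for the weight gradient (drhs)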

src/MaxText/kernels/megablox/ops.py

Lines changed: 114 additions & 46 deletions
@@ -17,27 +17,33 @@
 # pylint: disable=too-many-positional-arguments

 import functools
+import itertools
+import dataclasses
 from typing import Literal
 import jax
 import jax.numpy as jnp
-from MaxText.kernels.megablox import backend
+from jax.experimental.xla_metadata import set_xla_metadata
+from MaxText.kernels.megablox import backend as megablox_backend
+# from tokamax._src.ops.ragged_dot import pallas_mosaic_tpu_kernel as tokamax_backend
+import tokamax
 import qwix
 import qwix.pallas as qpl

-
+_counter = itertools.count()
 def gmm(
     lhs: jnp.ndarray,
     rhs: jnp.ndarray,
     group_sizes: jnp.ndarray,
     preferred_element_type: jnp.dtype = jnp.float32,
-    tiling: tuple[int, int, int] = (128, 128, 128),
+    tiling: tuple[int, int, int, int, int, int, int, int, int] = (128, 128, 128, 128, 128, 128, 128, 128, 128),
     group_offset: jnp.ndarray | None = None,
     existing_out: jnp.ndarray | None = None,
     transpose_rhs: bool = False,
     interpret: bool = False,
     lhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
     rhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
     use_qwix_quantization: bool = False,
+    use_tokamax_backend: bool = False,
 ):
   """Grouped matrix multiplication operation."""
   quantization_rule = None
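For reference, a minimal call sketch against the widened signature. Shapes, values, and the import path are illustrative assumptions rather than taken from the commit, and the kernels target TPU, so treat this as a sketch, not a portable script:

import jax.numpy as jnp
from MaxText.kernels.megablox import ops  # assumed import path for this module

lhs = jnp.ones((512, 1024), dtype=jnp.bfloat16)      # [tokens, embed]
rhs = jnp.ones((8, 1024, 4096), dtype=jnp.bfloat16)  # [num_experts, embed, mlp]
group_sizes = jnp.full((8,), 64, dtype=jnp.int32)    # tokens routed to each expert

out = ops.gmm(
    lhs,
    rhs,
    group_sizes,
    preferred_element_type=jnp.bfloat16,
    tiling=(512, 1024, 1024,   # forward tiles
            512, 1024, 1024,   # dlhs-backward tiles
            512, 1024, 1024),  # drhs-backward tiles
    use_tokamax_backend=False,  # keep the Megablox kernels
)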
@@ -70,6 +76,7 @@ def gmm(
       transpose_rhs,
       interpret,
       quantization_rule,
+      use_tokamax_backend,
   )


@@ -78,12 +85,13 @@ def _gmm_fwd(
     rhs: jnp.ndarray,
     group_sizes: jnp.ndarray,
     preferred_element_type: jnp.dtype = jnp.float32,
-    tiling: tuple[int, int, int] = (128, 128, 128),
+    tiling: tuple[int, int, int, int, int, int, int, int, int] = (128, 128, 128, 128, 128, 128, 128, 128, 128),
     group_offset: jnp.ndarray | None = None,
     existing_out: jnp.ndarray | None = None,
     transpose_rhs: bool = False,
     interpret: bool = False,
     quantization_rule: qwix.QtRule | None = None,
+    use_tokamax_backend: bool = False,
 ) -> tuple[
     jnp.ndarray,
     tuple[
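The new use_tokamax_backend flag is threaded into both the forward and backward rules (it reappears in the _gmm_bwd signature further down). A generic sketch of that pattern with jax.custom_vjp, not the MaxText wiring itself, looks like this:

import functools
import jax
import jax.numpy as jnp

# Static configuration (here: a backend flag) is passed as a non-differentiated
# argument so the forward and backward rules see the same setting.
@functools.partial(jax.custom_vjp, nondiff_argnums=(2,))
def my_matmul(x, w, use_fast_path):
  return x @ w

def my_matmul_fwd(x, w, use_fast_path):
  return x @ w, (x, w)

def my_matmul_bwd(use_fast_path, residual, g):
  x, w = residual
  # The flag is visible here too, so both passes can pick matching backends.
  return (g @ w.T, x.T @ g)

my_matmul.defvjp(my_matmul_fwd, my_matmul_bwd)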
@@ -94,15 +102,17 @@ def _gmm_fwd(
     ],
 ]:
   """Forward function for GMM VJP."""
+  fwd_counter = next(_counter)
   if quantization_rule:
     if quantization_rule.act_qtype:
-      lhs = qpl.quantize(
-          lhs,
-          quantization_rule.act_qtype,
-          channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [0],
-          calibration_method=quantization_rule.act_calibration_method,
-          scale_dtype=jnp.float32,
-      )
+      with set_xla_metadata(MUST_FUSE=fwd_counter):
+        lhs = qpl.quantize(
+            lhs,
+            quantization_rule.act_qtype,
+            channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [0],
+            calibration_method=quantization_rule.act_calibration_method,
+            scale_dtype=jnp.float32,
+        )
     if quantization_rule.weight_qtype:
       rhs = qpl.quantize(
           rhs,
@@ -114,29 +124,50 @@ def _gmm_fwd(
           calibration_method=quantization_rule.weight_calibration_method,
           scale_dtype=jnp.float32,
       )
-
-  out = backend.gmm(
+      # QAG is only supported for following conditions
+      if quantization_rule.weight_calibration_method.startswith("fixed") and isinstance(rhs, qpl.QArray):
+        rhs_qvalue = jax.lax.all_gather(rhs.qvalue, "fsdp", axis=0, tiled=True)
+        rhs = dataclasses.replace(rhs, qvalue=rhs_qvalue)
+  if use_tokamax_backend:
+    with set_xla_metadata(MUST_FUSE=fwd_counter):
+      out = tokamax.ragged_dot_general(
+          lhs=lhs,
+          rhs=rhs,
+          group_sizes=group_sizes,
+          ragged_dot_dimension_numbers=tokamax.RaggedDotDimensionNumbers(
+              dot_dimension_numbers=(([1], [1]), ([], [])),
+              lhs_ragged_dimensions=[0],
+              rhs_group_dimensions=[0],
+          ),
+          precision=jax.lax.Precision.DEFAULT,
+          preferred_element_type=preferred_element_type,
+          group_offset=group_offset,
+          implementation="mosaic",
+      )
+  else:
+    out = megablox_backend.gmm(
       lhs,
       rhs,
       group_sizes,
       preferred_element_type,
-      tiling,
+      tiling[:3],
       group_offset,
       existing_out,
       transpose_rhs=transpose_rhs,
       interpret=interpret,
-  )
+    )
   return out, (lhs, rhs, group_sizes, group_offset)


 def _gmm_bwd(
     lhs_dtype: jax.typing.DTypeLike,
     rhs_dtype: jax.typing.DTypeLike,
     preferred_element_type: jnp.dtype,
-    tiling: tuple[int, int, int],
+    tiling: tuple[int, int, int, int, int, int, int, int, int],
     transpose_rhs: bool,
     interpret: bool,
     quantization_rule: qwix.QtRule | None,
+    use_tokamax_backend: bool,
     residual: tuple[
         jnp.ndarray | qpl.QArray,
         jnp.ndarray | qpl.QArray,
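The dimension numbers in the tokamax forward call above match plain ragged-dot semantics: contract the embed axis (lhs dim 1 with rhs dim 1), ragged over the token axis, with one rhs slice per group. A small pure-JAX reference using jax.lax.ragged_dot, with illustrative shapes:

import jax
import jax.numpy as jnp

lhs = jnp.ones((16, 8), dtype=jnp.float32)        # [tokens, embed]
rhs = jnp.ones((4, 8, 32), dtype=jnp.float32)     # [groups, embed, mlp]
group_sizes = jnp.array([4, 4, 4, 4], jnp.int32)  # rows of lhs per group

out = jax.lax.ragged_dot(lhs, rhs, group_sizes)   # -> [tokens, mlp]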
@@ -160,6 +191,8 @@
   # - drhs_dout: the incoming gradient used to calculate drhs.

   # dlhs_dout and drhs_dout can be different when quantization is enabled.
+  dlhs_counter = next(_counter)
+  drhs_counter = next(_counter)
   dlhs_dout = grad
   drhs_dout = grad
   if isinstance(rhs, qpl.QArray):  # qvalue: [g, k, n] scale: [1, 1, n]
@@ -173,41 +206,76 @@
   lhs = lhs.qvalue
   if quantization_rule and quantization_rule.bwd_qtype:
     # Enable backward pass quantization
-    dlhs_dout = qpl.quantize(
+    with set_xla_metadata(MUST_FUSE=dlhs_counter):
+      dlhs_dout = qpl.quantize(
+          dlhs_dout,
+          quantization_rule.bwd_qtype,
+          channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [0],
+          calibration_method=quantization_rule.bwd_calibration_method,
+          scale_dtype=jnp.float32,
+      )
+    with set_xla_metadata(MUST_FUSE=drhs_counter):
+      drhs_dout = qpl.quantize(
+          drhs_dout,
+          quantization_rule.bwd_qtype,
+          channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [1],
+          calibration_method=quantization_rule.bwd_calibration_method,
+          scale_dtype=jnp.float32,
+      )
+  if use_tokamax_backend:
+    with set_xla_metadata(MUST_FUSE=dlhs_counter):
+      dlhs = tokamax.ragged_dot_general(
+          lhs=dlhs_dout,
+          rhs=rhs,
+          group_sizes=group_sizes,
+          ragged_dot_dimension_numbers=jax.lax.RaggedDotDimensionNumbers(
+              dot_dimension_numbers=(([1], [2]), ([], [])),
+              lhs_ragged_dimensions=[0],
+              rhs_group_dimensions=[0],
+          ),
+          precision=jax.lax.Precision.DEFAULT,
+          preferred_element_type=preferred_element_type,
+          group_offset=group_offset,
+          implementation="mosaic",
+      )
+    drhs = tokamax.tgmm(
+        lhs.swapaxes(0, 1),
+        drhs_dout,
+        group_sizes=group_sizes,
+        ragged_dot_dimension_numbers=jax.lax.RaggedDotDimensionNumbers(
+            dot_dimension_numbers=(([0], [0]), ([], [])),
+            lhs_ragged_dimensions=[0],
+            rhs_group_dimensions=[],
+        ),
+        precision=jax.lax.Precision.DEFAULT,
+        preferred_element_type=preferred_element_type,
+        group_offset=group_offset,
+        implementation="mosaic",
+    )
+  else:
+    dlhs = megablox_backend.gmm(
         dlhs_dout,
-        quantization_rule.bwd_qtype,
-        channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [0],
-        calibration_method=quantization_rule.bwd_calibration_method,
-        scale_dtype=jnp.float32,
+        rhs,
+        group_sizes,
+        lhs_dtype,
+        tiling[3:6],
+        group_offset,
+        transpose_rhs=not transpose_rhs,
+        interpret=interpret,
     )
-    drhs_dout = qpl.quantize(
+    drhs = megablox_backend.tgmm(
+        lhs.swapaxes(0, 1),
         drhs_dout,
-        quantization_rule.bwd_qtype,
-        channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [1],
-        calibration_method=quantization_rule.bwd_calibration_method,
-        scale_dtype=jnp.float32,
+        group_sizes,
+        rhs_dtype,
+        tiling[-3:],
+        group_offset,
+        num_actual_groups,
+        interpret=interpret,
     )

-  dlhs = backend.gmm(
-      dlhs_dout,
-      rhs,
-      group_sizes,
-      lhs_dtype,
-      tiling,
-      group_offset,
-      transpose_rhs=not transpose_rhs,
-      interpret=interpret,
-  )
-  drhs = backend.tgmm(
-      lhs.swapaxes(0, 1),
-      drhs_dout,
-      group_sizes,
-      rhs_dtype,
-      tiling,
-      group_offset,
-      num_actual_groups,
-      interpret=interpret,
-  )
+  if quantization_rule and quantization_rule.bwd_qtype:
+    drhs = jax.lax.psum_scatter(drhs, "fsdp", scatter_dimension=0, tiled=True)

   # NOTE: If the rhs transposition is fused into the forward pass we need to
   # return the transpose of the rhs gradient that we calculated above.
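For orientation, the two backward products above have simple reference semantics (a pure-JAX sketch, not the Pallas or tokamax kernels; reference_bwd is a hypothetical helper): per group g, the dlhs rows are dout @ rhs[g].T (the activation gradient, tiled by tiling[3:6]) and drhs[g] is lhs[g].T @ dout (the per-expert weight gradient, tiled by tiling[-3:]).

import jax.numpy as jnp

def reference_bwd(lhs, rhs, dout, group_sizes):
  # lhs: [tokens, embed], rhs: [groups, embed, mlp], dout: [tokens, mlp]
  starts = [0]
  for n in group_sizes:
    starts.append(starts[-1] + int(n))
  dlhs = jnp.zeros_like(lhs)
  drhs = jnp.zeros_like(rhs)
  for g, n in enumerate(group_sizes):
    s, e = starts[g], starts[g] + int(n)
    dlhs = dlhs.at[s:e].set(dout[s:e] @ rhs[g].T)  # gradient w.r.t. the activations
    drhs = drhs.at[g].set(lhs[s:e].T @ dout[s:e])  # gradient w.r.t. the expert weights
  return dlhs, drhs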

src/MaxText/layers/moe.py

Lines changed: 38 additions & 6 deletions
@@ -810,6 +810,12 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
          min(tiling[0], m),
          min(tiling[1], k),
          min(tiling[2], n),
+          min(tiling[3], m),
+          min(tiling[4], k),
+          min(tiling[5], n),
+          min(tiling[6], m),
+          min(tiling[7], k),
+          min(tiling[8], n),
      )
      if self.config.use_tokamax_gmm:
        output = tokamax_api.ragged_dot(
@@ -820,6 +826,19 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
            preferred_element_type=self.dtype,
            implementation="mosaic",
        )
+      elif self.config.use_tokamax_gmm and self.config.quantization:
+        # call mblx gmm with tokamax quantization
+        output = mblx.gmm(
+            lhs=inputs,
+            rhs=kernel,
+            group_sizes=group_sizes,
+            preferred_element_type=self.dtype,
+            tiling=tiling,
+            lhs_quantize_dtype=lhs_quantize_dtype,
+            rhs_quantize_dtype=rhs_quantize_dtype,
+            use_qwix_quantization=self.config.use_qwix_quantization,
+            use_tokamax_backend=self.config.use_tokamax_gmm,
+        )
      else:
        if self.config.megablox:
          output = mblx.gmm(
@@ -831,6 +850,7 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
              lhs_quantize_dtype=lhs_quantize_dtype,
              rhs_quantize_dtype=rhs_quantize_dtype,
              use_qwix_quantization=self.config.use_qwix_quantization,
+              use_tokamax_backend=self.config.use_tokamax_gmm,
          )
        else:
          rhs_inputs = kernel
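Taken together, the gmm wrapper in moe.py now dispatches between three kernel paths. A condensed, illustrative sketch is below; mblx and tokamax_api are the module aliases already used in this file, argument lists are abbreviated, and the quantized branch is written first here because, as committed, an elif that follows the unconditional use_tokamax_gmm check cannot be reached:

# Control-flow sketch only, not the committed code.
def dispatch_gmm(config, inputs, kernel, group_sizes, tiling):
  if config.use_tokamax_gmm and config.quantization:
    # Megablox gmm routed through the qwix-quantized tokamax backend.
    return mblx.gmm(lhs=inputs, rhs=kernel, group_sizes=group_sizes,
                    tiling=tiling, use_tokamax_backend=True)
  if config.use_tokamax_gmm:
    # Unquantized tokamax ragged dot.
    return tokamax_api.ragged_dot(inputs, kernel, group_sizes, implementation="mosaic")
  if config.megablox:
    # Original Megablox kernel path.
    return mblx.gmm(lhs=inputs, rhs=kernel, group_sizes=group_sizes,
                    tiling=tiling, use_tokamax_backend=False)
  ...  # dense fallback path (unchanged by this commit)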
@@ -1041,14 +1061,26 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
          expert_assignments=selected_experts,
      )
      wi_tile_size = (
-          self.config.tile_batch_seq,
-          self.config.tile_embed_dim,
-          self.config.tile_mlp_dim,
+          self.config.tile_fwd_batch_seq,
+          self.config.tile_fwd_embed_dim,
+          self.config.tile_fwd_mlp_dim,
+          self.config.tile_dlhs_batch_seq,
+          self.config.tile_dlhs_embed_dim,
+          self.config.tile_dlhs_mlp_dim,
+          self.config.tile_drhs_batch_seq,
+          self.config.tile_drhs_embed_dim,
+          self.config.tile_drhs_mlp_dim,
      )
      wo_tile_size = (
-          self.config.tile_batch_seq,
-          self.config.tile_mlp_dim,
-          self.config.tile_embed_dim,
+          self.config.tile_fwd_batch_seq,
+          self.config.tile_fwd_mlp_dim,
+          self.config.tile_fwd_embed_dim,
+          self.config.tile_dlhs_batch_seq,
+          self.config.tile_dlhs_mlp_dim,
+          self.config.tile_dlhs_embed_dim,
+          self.config.tile_drhs_batch_seq,
+          self.config.tile_drhs_mlp_dim,
+          self.config.tile_drhs_embed_dim,
      )
      layer_w0 = gmm_fn(x, w0, tiling=wi_tile_size)
      if self.get_tensor_transpose_parallelism_size() > 1:
