
Commit ecc3506

Revert "all 9 flags"
This reverts commit 82f327b.
1 parent 82f327b · commit ecc3506

File tree: 3 files changed, +55 -161 lines changed

src/MaxText/configs/base.yml
Lines changed: 3 additions & 9 deletions

@@ -178,15 +178,9 @@ use_random_routing: False # whether to use random routing for debug/test purpose
 use_custom_sort_vjp: True # whether to use a custom sort vjp for sparse matmul ops
 use_ring_of_experts: False # whether to use ring of experts for sparse matmul expert parallelism
 # Tunable tiling dimensions used for Megablox
-tile_fwd_batch_seq: 512
-tile_fwd_embed_dim: 1024
-tile_fwd_mlp_dim: 1024
-tile_dlhs_fwd_batch_seq: 512
-tile_dlhs_fwd_embed_dim: 1024
-tile_dlhs_fwd_mlp_dim: 1024
-tile_drhs_fwd_batch_seq: 512
-tile_drhs_fwd_embed_dim: 1024
-tile_drhs_fwd_mlp_dim: 1024
+tile_batch_seq: 512
+tile_embed_dim: 1024
+tile_mlp_dim: 1024
 norm_topk_prob: False # Boolean to enable the top-k probability normalization. Qwen3-specific normalization of router weights.

 # How the expert axis is used to shard attention weights and activations
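
For reference, the nine keys removed above were packed into one flat tuple and sliced per pass (tiling[:3] for the forward gmm, tiling[3:6] for the dlhs gradient, tiling[-3:] for the drhs gradient, as the ops.py hunks below show), while the restored keys form a single (m, k, n) triple shared by all three grouped matmuls. A rough Python sketch of that relationship using the defaults from this config (illustrative only, not MaxText code):

# Sketch: the reverted 9-flag scheme vs. the restored 3-flag scheme.
nine_flag_tiling = (
    512, 1024, 1024,  # tile_fwd_*       -> forward gmm
    512, 1024, 1024,  # tile_dlhs_fwd_*  -> dlhs gradient gmm
    512, 1024, 1024,  # tile_drhs_fwd_*  -> drhs gradient gmm
)
three_flag_tiling = (512, 1024, 1024)  # tile_batch_seq, tile_embed_dim, tile_mlp_dim
# With the defaults above, every per-pass slice collapses to the same triple.
assert nine_flag_tiling[:3] == nine_flag_tiling[3:6] == nine_flag_tiling[-3:] == three_flag_tiling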

src/MaxText/kernels/megablox/ops.py
Lines changed: 46 additions & 114 deletions

@@ -17,33 +17,27 @@
 # pylint: disable=too-many-positional-arguments

 import functools
-import itertools
-import dataclasses
 from typing import Literal
 import jax
 import jax.numpy as jnp
-from jax.experimental.xla_metadata import set_xla_metadata
-from MaxText.kernels.megablox import backend as megablox_backend
-# from tokamax._src.ops.ragged_dot import pallas_mosaic_tpu_kernel as tokamax_backend
-import tokamax
+from MaxText.kernels.megablox import backend
 import qwix
 import qwix.pallas as qpl

-_counter = itertools.count()
+
 def gmm(
     lhs: jnp.ndarray,
     rhs: jnp.ndarray,
     group_sizes: jnp.ndarray,
     preferred_element_type: jnp.dtype = jnp.float32,
-    tiling: tuple[int, int, int, int, int, int, int, int, int] = (128, 128, 128, 128, 128, 128, 128, 128, 128),
+    tiling: tuple[int, int, int] = (128, 128, 128),
     group_offset: jnp.ndarray | None = None,
     existing_out: jnp.ndarray | None = None,
     transpose_rhs: bool = False,
     interpret: bool = False,
     lhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
     rhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
     use_qwix_quantization: bool = False,
-    use_tokamax_backend: bool = False,
 ):
   """Grouped matrix multiplication operation."""
   quantization_rule = None
@@ -76,7 +70,6 @@ def gmm(
       transpose_rhs,
       interpret,
       quantization_rule,
-      use_tokamax_backend,
   )


@@ -85,13 +78,12 @@ def _gmm_fwd(
     rhs: jnp.ndarray,
     group_sizes: jnp.ndarray,
     preferred_element_type: jnp.dtype = jnp.float32,
-    tiling: tuple[int, int, int, int, int, int, int, int, int] = (128, 128, 128, 128, 128, 128, 128, 128, 128),
+    tiling: tuple[int, int, int] = (128, 128, 128),
     group_offset: jnp.ndarray | None = None,
     existing_out: jnp.ndarray | None = None,
     transpose_rhs: bool = False,
     interpret: bool = False,
     quantization_rule: qwix.QtRule | None = None,
-    use_tokamax_backend: bool = False,
 ) -> tuple[
     jnp.ndarray,
     tuple[
@@ -102,17 +94,15 @@ def _gmm_fwd(
     ],
 ]:
   """Forward function for GMM VJP."""
-  fwd_counter = next(_counter)
   if quantization_rule:
     if quantization_rule.act_qtype:
-      with set_xla_metadata(MUST_FUSE=fwd_counter):
-        lhs = qpl.quantize(
-            lhs,
-            quantization_rule.act_qtype,
-            channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [0],
-            calibration_method=quantization_rule.act_calibration_method,
-            scale_dtype=jnp.float32,
-        )
+      lhs = qpl.quantize(
+          lhs,
+          quantization_rule.act_qtype,
+          channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [0],
+          calibration_method=quantization_rule.act_calibration_method,
+          scale_dtype=jnp.float32,
+      )
     if quantization_rule.weight_qtype:
       rhs = qpl.quantize(
           rhs,
@@ -124,50 +114,29 @@
           calibration_method=quantization_rule.weight_calibration_method,
           scale_dtype=jnp.float32,
       )
-      # QAG is only supported for following conditions
-      if quantization_rule.weight_calibration_method.startswith("fixed") and isinstance(rhs, qpl.QArray):
-        rhs_qvalue = jax.lax.all_gather(rhs.qvalue, "fsdp", axis=0, tiled=True)
-        rhs = dataclasses.replace(rhs, qvalue=rhs_qvalue)
-  if use_tokamax_backend:
-    with set_xla_metadata(MUST_FUSE=fwd_counter):
-      out = tokamax.ragged_dot_general(
-          lhs=lhs,
-          rhs=rhs,
-          group_sizes=group_sizes,
-          ragged_dot_dimension_numbers=tokamax.RaggedDotDimensionNumbers(
-              dot_dimension_numbers=(([1], [1]), ([], [])),
-              lhs_ragged_dimensions=[0],
-              rhs_group_dimensions=[0],
-          ),
-          precision=jax.lax.Precision.DEFAULT,
-          preferred_element_type=preferred_element_type,
-          group_offset=group_offset,
-          implementation="mosaic",
-      )
-  else:
-    out = megablox_backend.gmm(
+
+  out = backend.gmm(
       lhs,
       rhs,
       group_sizes,
       preferred_element_type,
-      tiling[:3],
+      tiling,
      group_offset,
      existing_out,
      transpose_rhs=transpose_rhs,
      interpret=interpret,
-    )
+  )
   return out, (lhs, rhs, group_sizes, group_offset)


 def _gmm_bwd(
     lhs_dtype: jax.typing.DTypeLike,
     rhs_dtype: jax.typing.DTypeLike,
     preferred_element_type: jnp.dtype,
-    tiling: tuple[int, int, int, int, int, int, int, int, int],
+    tiling: tuple[int, int, int],
     transpose_rhs: bool,
     interpret: bool,
     quantization_rule: qwix.QtRule | None,
-    use_tokamax_backend: bool,
     residual: tuple[
         jnp.ndarray | qpl.QArray,
         jnp.ndarray | qpl.QArray,
@@ -191,8 +160,6 @@ def _gmm_bwd(
   # - drhs_dout: the incoming gradient used to calculate drhs.

   # dlhs_dout and drhs_dout can be different when quantization is enabled.
-  dlhs_counter = next(_counter)
-  drhs_counter = next(_counter)
   dlhs_dout = grad
   drhs_dout = grad
   if isinstance(rhs, qpl.QArray): # qvalue: [g, k, n] scale: [1, 1, n]
@@ -206,76 +173,41 @@
     lhs = lhs.qvalue
   if quantization_rule and quantization_rule.bwd_qtype:
     # Enable backward pass quantization
-    with set_xla_metadata(MUST_FUSE=dlhs_counter):
-      dlhs_dout = qpl.quantize(
-          dlhs_dout,
-          quantization_rule.bwd_qtype,
-          channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [0],
-          calibration_method=quantization_rule.bwd_calibration_method,
-          scale_dtype=jnp.float32,
-      )
-    with set_xla_metadata(MUST_FUSE=drhs_counter):
-      drhs_dout = qpl.quantize(
-          drhs_dout,
-          quantization_rule.bwd_qtype,
-          channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [1],
-          calibration_method=quantization_rule.bwd_calibration_method,
-          scale_dtype=jnp.float32,
-      )
-  if use_tokamax_backend:
-    with set_xla_metadata(MUST_FUSE=dlhs_counter):
-      dlhs = tokamax.ragged_dot_general(
-          lhs=dlhs_dout,
-          rhs=rhs,
-          group_sizes=group_sizes,
-          ragged_dot_dimension_numbers=jax.lax.RaggedDotDimensionNumbers(
-              dot_dimension_numbers=(([1], [2]), ([], [])),
-              lhs_ragged_dimensions=[0],
-              rhs_group_dimensions=[0],
-          ),
-          precision=jax.lax.Precision.DEFAULT,
-          preferred_element_type=preferred_element_type,
-          group_offset=group_offset,
-          implementation="mosaic",
-      )
-    drhs = tokamax.tgmm(
-        lhs.swapaxes(0, 1),
-        drhs_dout,
-        group_sizes=group_sizes,
-        ragged_dot_dimension_numbers=jax.lax.RaggedDotDimensionNumbers(
-            dot_dimension_numbers=(([0], [0]), ([], [])),
-            lhs_ragged_dimensions=[0],
-            rhs_group_dimensions=[],
-        ),
-        precision=jax.lax.Precision.DEFAULT,
-        preferred_element_type=preferred_element_type,
-        group_offset=group_offset,
-        implementation="mosaic",
-    )
-  else:
-    dlhs = megablox_backend.gmm(
+    dlhs_dout = qpl.quantize(
         dlhs_dout,
-        rhs,
-        group_sizes,
-        lhs_dtype,
-        tiling[3:6],
-        group_offset,
-        transpose_rhs=not transpose_rhs,
-        interpret=interpret,
+        quantization_rule.bwd_qtype,
+        channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [0],
+        calibration_method=quantization_rule.bwd_calibration_method,
+        scale_dtype=jnp.float32,
     )
-    drhs = megablox_backend.tgmm(
-        lhs.swapaxes(0, 1),
+    drhs_dout = qpl.quantize(
         drhs_dout,
-        group_sizes,
-        rhs_dtype,
-        tiling[-3:],
-        group_offset,
-        num_actual_groups,
-        interpret=interpret,
+        quantization_rule.bwd_qtype,
+        channelwise_axes=[] if quantization_rule.disable_channelwise_axes else [1],
+        calibration_method=quantization_rule.bwd_calibration_method,
+        scale_dtype=jnp.float32,
     )

-  if quantization_rule and quantization_rule.bwd_qtype:
-    drhs = jax.lax.psum_scatter(drhs, "fsdp", scatter_dimension=0, tiled=True)
+  dlhs = backend.gmm(
+      dlhs_dout,
+      rhs,
+      group_sizes,
+      lhs_dtype,
+      tiling,
+      group_offset,
+      transpose_rhs=not transpose_rhs,
+      interpret=interpret,
+  )
+  drhs = backend.tgmm(
+      lhs.swapaxes(0, 1),
+      drhs_dout,
+      group_sizes,
+      rhs_dtype,
+      tiling,
+      group_offset,
+      num_actual_groups,
+      interpret=interpret,
+  )

   # NOTE: If the rhs transposition is fused into the forward pass we need to
   # return the transpose of the rhs gradient that we calculated above.
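
The signature changes above touch gmm, _gmm_fwd, and _gmm_bwd together because tiling appears to travel through the custom VJP as a non-differentiable argument (it is listed before residual in _gmm_bwd). Below is a minimal, self-contained sketch of that general jax.custom_vjp pattern with a single (tm, tk, tn) triple; toy_gmm is a hypothetical stand-in for the Pallas backend kernel and this is not the file's actual wiring:

import functools
import jax
import jax.numpy as jnp


def toy_gmm(lhs, rhs, tiling):
  """Hypothetical stand-in for the Pallas kernel; a real kernel would block by (tm, tk, tn)."""
  del tiling  # the tile sizes would drive the Pallas grid here
  return lhs @ rhs


@functools.partial(jax.custom_vjp, nondiff_argnums=(2,))
def gmm(lhs, rhs, tiling):
  return toy_gmm(lhs, rhs, tiling)


def _gmm_fwd(lhs, rhs, tiling):
  # Save residuals for the backward pass, as the real _gmm_fwd does.
  return toy_gmm(lhs, rhs, tiling), (lhs, rhs)


def _gmm_bwd(tiling, residual, grad):
  # After the revert, one (tm, tk, tn) triple serves both gradient matmuls,
  # analogous to backend.gmm and backend.tgmm receiving the same `tiling`.
  lhs, rhs = residual
  dlhs = toy_gmm(grad, rhs.T, tiling)
  drhs = toy_gmm(lhs.T, grad, tiling)
  return dlhs, drhs


gmm.defvjp(_gmm_fwd, _gmm_bwd)

x, w = jnp.ones((8, 4)), jnp.ones((4, 16))
print(jax.grad(lambda a: gmm(a, w, (128, 128, 128)).sum())(x).shape)  # (8, 4)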

src/MaxText/layers/moe.py
Lines changed: 6 additions & 38 deletions

@@ -810,12 +810,6 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
           min(tiling[0], m),
           min(tiling[1], k),
           min(tiling[2], n),
-          min(tiling[3], m),
-          min(tiling[4], k),
-          min(tiling[5], n),
-          min(tiling[6], m),
-          min(tiling[7], k),
-          min(tiling[8], n),
       )
       if self.config.use_tokamax_gmm:
         output = tokamax_api.ragged_dot(
@@ -826,19 +820,6 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
             preferred_element_type=self.dtype,
             implementation="mosaic",
         )
-      elif self.config.use_tokamax_gmm and self.config.quantization:
-        # call mblx gmm with tokamax quantization
-        output = mblx.gmm(
-            lhs=inputs,
-            rhs=kernel,
-            group_sizes=group_sizes,
-            preferred_element_type=self.dtype,
-            tiling=tiling,
-            lhs_quantize_dtype=lhs_quantize_dtype,
-            rhs_quantize_dtype=rhs_quantize_dtype,
-            use_qwix_quantization=self.config.use_qwix_quantization,
-            use_tokamax_backend=self.config.use_tokamax_gmm,
-        )
       else:
         if self.config.megablox:
           output = mblx.gmm(
@@ -850,7 +831,6 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
              lhs_quantize_dtype=lhs_quantize_dtype,
              rhs_quantize_dtype=rhs_quantize_dtype,
              use_qwix_quantization=self.config.use_qwix_quantization,
-              use_tokamax_backend=self.config.use_tokamax_gmm,
          )
        else:
          rhs_inputs = kernel
@@ -1061,26 +1041,14 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
          expert_assignments=selected_experts,
      )
      wi_tile_size = (
-          self.config.tile_fwd_batch_seq,
-          self.config.tile_fwd_embed_dim,
-          self.config.tile_fwd_mlp_dim,
-          self.config.tile_dlhs_batch_seq,
-          self.config.tile_dlhs_embed_dim,
-          self.config.tile_dlhs_mlp_dim,
-          self.config.tile_drhs_batch_seq,
-          self.config.tile_drhs_embed_dim,
-          self.config.tile_drhs_mlp_dim,
+          self.config.tile_batch_seq,
+          self.config.tile_embed_dim,
+          self.config.tile_mlp_dim,
      )
      wo_tile_size = (
-          self.config.tile_fwd_batch_seq,
-          self.config.tile_fwd_mlp_dim,
-          self.config.tile_fwd_embed_dim,
-          self.config.tile_dlhs_batch_seq,
-          self.config.tile_dlhs_mlp_dim,
-          self.config.tile_dlhs_embed_dim,
-          self.config.tile_drhs_batch_seq,
-          self.config.tile_drhs_mlp_dim,
-          self.config.tile_drhs_embed_dim,
+          self.config.tile_batch_seq,
+          self.config.tile_mlp_dim,
+          self.config.tile_embed_dim,
      )
      layer_w0 = gmm_fn(x, w0, tiling=wi_tile_size)
      if self.get_tensor_transpose_parallelism_size() > 1:
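
The restored call site clamps each configured tile size against the actual grouped-matmul dimensions and reuses the same three config values for both projections, with the embed and mlp positions swapped for the wo matmul. A small sketch of that logic, assuming m, k, and n are the token, input-feature, and output-feature dimensions (the dimension meanings and example sizes are assumptions, not MaxText code):

# Sketch of the restored tiling logic (assumed dimension meanings noted above).
tile_batch_seq, tile_embed_dim, tile_mlp_dim = 512, 1024, 1024

def clamp_tiling(tiling, m, k, n):
  # Mirrors the min(tiling[i], dim) guard above: a tile never exceeds the
  # dimension it covers.
  return (min(tiling[0], m), min(tiling[1], k), min(tiling[2], n))

wi_tile_size = (tile_batch_seq, tile_embed_dim, tile_mlp_dim)  # used for gmm_fn(x, w0, tiling=wi_tile_size)
wo_tile_size = (tile_batch_seq, tile_mlp_dim, tile_embed_dim)  # embed/mlp swapped for the wo projection

print(clamp_tiling(wi_tile_size, m=256, k=4096, n=14336))  # -> (256, 1024, 1024)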
