
Commit a24092e

Update configs for tokamax
1 parent 9d5cbba

2 files changed: +45 −50 lines


src/MaxText/layers/attention_op.py

Lines changed: 11 additions & 15 deletions
@@ -20,7 +20,6 @@
 import math
 
 import numpy as np
-from packaging import version
 
 import jax
 from jax import lax
@@ -1286,21 +1285,18 @@ def wrap_flash_attention(
       decoder_segment_ids_tuple = splash_attention_kernel.SegmentIds(decoder_segment_ids_q, decoder_segment_ids_kv)
     else:
       decoder_segment_ids_tuple = None
-    # TODO(ranran): remove if/else branch once b/441336842 is fixed
-    if version.parse(jax.__version__) < version.parse("0.7.2.dev20250824"):
-      attention_output = jax.vmap(splash_kernel)(query, key, value, decoder_segment_ids_tuple)
-    else:
-      if self.config.use_tokamax_splash:
-        if max_logit_value is not None:
-          attention_output = jax.vmap(partial(splash_kernel, max_logit_value=max_logit_value))(
-              query, key, value, decoder_segment_ids_tuple
-          )
-        else:
-          attention_output = jax.vmap(splash_kernel)(query, key, value, decoder_segment_ids_tuple)
-      else:
-        attention_output = jax.vmap(splash_kernel, in_axes=(0, 0, 0, 0, None))(
-            query, key, value, decoder_segment_ids_tuple, sinks
+
+    if self.config.use_tokamax_splash:
+      if max_logit_value is not None:
+        attention_output = jax.vmap(partial(splash_kernel, max_logit_value=max_logit_value))(
+            query, key, value, decoder_segment_ids_tuple
         )
+      else:
+        attention_output = jax.vmap(splash_kernel)(query, key, value, decoder_segment_ids_tuple)
+    else:
+      attention_output = jax.vmap(splash_kernel, in_axes=(0, 0, 0, 0, None))(
+          query, key, value, decoder_segment_ids_tuple, sinks
+      )
     return attention_output
 
 def _maybe_shard_with_pspec(inputs, pspec: jax.sharding.PartitionSpec | None):
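Note on the new dispatch above: with the JAX version guard gone, the kernel call always goes through jax.vmap, and the sinks argument is broadcast to every mapped call via in_axes=(0, 0, 0, 0, None). The standalone sketch below only illustrates that vmap wiring; plain_attention is a hypothetical stand-in with the same positional signature as the real splash kernel, and the way it consumes max_logit_value and sinks is made up for the example.

from functools import partial

import jax
import jax.numpy as jnp


def plain_attention(q, k, v, segment_ids=None, sinks=None, max_logit_value=None):
  # Hypothetical stand-in for the splash kernel: q, k, v are [seq, head_dim]
  # slices for a single mapped head; sinks/max_logit_value are extra scalars.
  logits = q @ k.T / jnp.sqrt(jnp.float32(q.shape[-1]))
  if max_logit_value is not None:
    logits = logits - max_logit_value  # only exercises the keyword wiring
  if sinks is not None:
    logits = logits + sinks            # shared, unmapped additive term
  return jax.nn.softmax(logits, axis=-1) @ v


heads, seq, dim = 4, 8, 16
kq, kk, kv = jax.random.split(jax.random.PRNGKey(0), 3)
query = jax.random.normal(kq, (heads, seq, dim))
key = jax.random.normal(kk, (heads, seq, dim))
value = jax.random.normal(kv, (heads, seq, dim))
sinks = jnp.float32(0.1)           # one value shared by all heads
max_logit_value = jnp.float32(2.0)
use_tokamax_splash = True          # mirrors config.use_tokamax_splash

if use_tokamax_splash:
  if max_logit_value is not None:
    attention_output = jax.vmap(partial(plain_attention, max_logit_value=max_logit_value))(
        query, key, value, None
    )
  else:
    attention_output = jax.vmap(plain_attention)(query, key, value, None)
else:
  # in_axes=(0, 0, 0, 0, None): map over q/k/v/segment ids, broadcast sinks.
  attention_output = jax.vmap(plain_attention, in_axes=(0, 0, 0, 0, None))(
      query, key, value, None, sinks
  )

print(attention_output.shape)  # (4, 8, 16)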

src/MaxText/layers/moe.py

Lines changed: 34 additions & 35 deletions
@@ -36,8 +36,7 @@
 from MaxText.layers import attentions, linears, quantizations, nnx_wrappers
 from MaxText.layers.initializers import NdInitializer, nd_dense_init, default_bias_init, variable_to_logically_partitioned
 
-if jax.__version__ >= "0.8.0":
-  from tokamax._src.ops.ragged_dot import api as tokamax_api
+from tokamax._src.ops.ragged_dot import api as tokamax_api
 
 set_xla_metadata = xla_metadata.set_xla_metadata
 
@@ -809,17 +808,17 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
          min(tiling[1], k),
          min(tiling[2], n),
      )
-      if self.config.megablox:
-        if self.config.use_tokamax_gmm:
-          output = tokamax_api.ragged_dot(  # pylint: disable=possibly-used-before-assignment
-              lhs=inputs,
-              rhs=kernel,
-              group_sizes=group_sizes,
-              precision=jax.lax.Precision.DEFAULT,
-              preferred_element_type=self.dtype,
-              implementation="mosaic",
-          )
-        else:
+      if self.config.use_tokamax_gmm:
+        output = tokamax_api.ragged_dot(
+            lhs=inputs,
+            rhs=kernel,
+            group_sizes=group_sizes,
+            precision=jax.lax.Precision.DEFAULT,
+            preferred_element_type=self.dtype,
+            implementation="mosaic",
+        )
+      else:
+        if self.config.megablox:
          output = mblx.gmm(
              lhs=inputs,
              rhs=kernel,
@@ -830,29 +829,29 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
              rhs_quantize_dtype=rhs_quantize_dtype,
              use_qwix_quantization=self.config.use_qwix_quantization,
          )
-      else:
-        rhs_inputs = kernel
-        if isinstance(kernel, aqt.QTensor):
-          if kernel.bias or kernel.sparsity_mask or len(kernel.scale) > 1:
-            raise ValueError("Unsupported usecase for ragged_dot with quantized kernel.")
-          rhs_inputs = kernel.qvalue
-        with set_xla_metadata(ragged_dot_tiling=",".join([str(t) for t in tiling])):
-          output = jax.lax.ragged_dot(
-              lhs=inputs,
-              rhs=rhs_inputs,
-              group_sizes=group_sizes,
-              preferred_element_type=self.dtype,
-          )
-        if isinstance(kernel, aqt.QTensor):
-          # Multiply outputs by the kernely scale
-          scales = jnp.take(kernel.scale[0].squeeze(), indices=expert_assignments, axis=0)
-          if padding_amount > 0:
-            scales = jax.lax.pad(
-                scales,
-                jnp.array(0.0, dtype=scales.dtype),
-                [(0, padding_amount, 0), (0, 0, 0)],
+        else:
+          rhs_inputs = kernel
+          if isinstance(kernel, aqt.QTensor):
+            if kernel.bias or kernel.sparsity_mask or len(kernel.scale) > 1:
+              raise ValueError("Unsupported usecase for ragged_dot with quantized kernel.")
+            rhs_inputs = kernel.qvalue
+          with set_xla_metadata(ragged_dot_tiling=",".join([str(t) for t in tiling])):
+            output = jax.lax.ragged_dot(
+                lhs=inputs,
+                rhs=rhs_inputs,
+                group_sizes=group_sizes,
+                preferred_element_type=self.dtype,
            )
-          output *= scales
+          if isinstance(kernel, aqt.QTensor):
+            # Multiply outputs by the kernely scale
+            scales = jnp.take(kernel.scale[0].squeeze(), indices=expert_assignments, axis=0)
+            if padding_amount > 0:
+              scales = jax.lax.pad(
+                  scales,
+                  jnp.array(0.0, dtype=scales.dtype),
+                  [(0, padding_amount, 0), (0, 0, 0)],
+              )
+            output *= scales
      if padding_amount > 0:
        output = output[: hs_shape[0]]
      return output
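All three gmm paths above (tokamax_api.ragged_dot, mblx.gmm, and the jax.lax.ragged_dot fallback) implement the same grouped-matmul contract: rows of lhs are split into contiguous groups by group_sizes, and each group is multiplied by its own expert's kernel slice from rhs. A small numeric sketch of that contract using the jax.lax.ragged_dot fallback (shapes and values are made up for illustration):

import jax
import jax.numpy as jnp

num_experts, m, k, n = 2, 6, 4, 3
lhs = jnp.arange(m * k, dtype=jnp.float32).reshape(m, k)   # [tokens, hidden]
rhs = jnp.ones((num_experts, k, n), dtype=jnp.float32)     # [experts, hidden, out]
rhs = rhs.at[1].multiply(2.0)                              # make the experts distinguishable
group_sizes = jnp.array([4, 2], dtype=jnp.int32)           # rows 0-3 -> expert 0, rows 4-5 -> expert 1

# Grouped matmul: each contiguous block of lhs rows hits its own expert kernel.
output = jax.lax.ragged_dot(
    lhs,
    rhs,
    group_sizes=group_sizes,
    preferred_element_type=jnp.float32,
)

# Reference computation, slicing per group explicitly.
reference = jnp.concatenate([lhs[:4] @ rhs[0], lhs[4:] @ rhs[1]])
assert jnp.allclose(output, reference)
print(output.shape)  # (6, 3)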

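The quantized-kernel branch that moved in the last hunk rescales the ragged_dot output per token: jnp.take gathers each token's expert scale via expert_assignments, jax.lax.pad zero-fills the scales when the token dimension was padded, and the padded rows are sliced off afterwards. A standalone sketch of just that step, with made-up shapes; expert_scales here is an illustrative stand-in for kernel.scale[0].squeeze() from the AQT QTensor:

import jax
import jax.numpy as jnp

n = 3
real_tokens, padding_amount = 5, 3                    # gmm output was padded to 8 rows
output = jnp.ones((real_tokens + padding_amount, n), dtype=jnp.float32)

# Illustrative per-expert dequantization scales (one row of n values per expert).
expert_scales = jnp.array([[0.5, 0.5, 0.5],
                           [2.0, 2.0, 2.0]], dtype=jnp.float32)
# Expert assignment for each unpadded token, as produced upstream of gmm.
expert_assignments = jnp.array([0, 0, 1, 1, 0], dtype=jnp.int32)

# Gather each token's scale row, then zero-pad to match the padded output rows.
scales = jnp.take(expert_scales, indices=expert_assignments, axis=0)
if padding_amount > 0:
  scales = jax.lax.pad(
      scales,
      jnp.array(0.0, dtype=scales.dtype),
      [(0, padding_amount, 0), (0, 0, 0)],            # (low, high, interior) per axis
  )
output *= scales
output = output[:real_tokens]                          # drop the padded rows
print(output)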