 
 from __future__ import annotations
 
+from collections.abc import Mapping
 import dataclasses
 import enum
 import functools
-from typing import Any, Callable, Literal, NamedTuple, Union, Optional, overload
+from typing import Any, Callable, Literal, NamedTuple, Optional, Union, overload
 
 import jax
 from jax import ad_checkpoint
@@ -89,10 +90,13 @@ class SegmentIds(NamedTuple):
 
 
 def get_kernel_name(
-    is_mqa: bool, save_residuals: bool, is_segmented: bool, phase: str
+    block_metadata: Mapping[str, Any],
+    is_mqa: bool,
+    save_residuals: bool,
+    is_segmented: bool,
+    phase: str,
 ) -> str:
   """Returns a unique name for all SplashAttention kernel variants."""
-
   assert phase == "dq" or phase == "dkv" or phase == "fwd"
   # Saving residuals is supported only for the fwd phase.
   assert not save_residuals or phase == "fwd"
@@ -103,7 +107,9 @@ def get_kernel_name(
     residuals = "_no_residuals"
   attention_type = "mqa" if is_mqa else "mha"
   segments = "_segmented" if is_segmented else ""
-  return f"splash_{attention_type}_{phase}{segments}{residuals}"
+  return f"splash_{attention_type}_{phase}{segments}{residuals}_" + "_".join(
+      f"{k}={v}" for k, v in sorted(block_metadata.items())
+  )
 
 
 # Reference attention implementations
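
For reference, a minimal standalone sketch (not part of this diff; the block values are hypothetical) of the name the new return statement builds from a block-metadata mapping, here for a non-MQA, segmented, no-residuals forward kernel:

    # Hypothetical metadata; the real callers pass block sizes and layouts.
    block_metadata = {"block_q": 512, "block_kv": 1024}
    name = "splash_mha_fwd_segmented_no_residuals_" + "_".join(
        f"{k}={v}" for k, v in sorted(block_metadata.items())
    )
    # name == "splash_mha_fwd_segmented_no_residuals_block_kv=1024_block_q=512"

Sorting the items keeps the name deterministic regardless of the order in which the metadata dict was built.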
@@ -1054,28 +1060,17 @@ def logsumexp_index_map(h, i, *_):
     out_shapes += [None]
     out_specs += [None]
 
-  # Attach useful metadata to the custom-call HLO op.
-  # Having this information available in an HLO-dump or xprof is valuable for
-  # debugging and performance investigation.
-  metadata_dict = dict(
-      block_sizes=dataclasses.asdict(block_sizes),
-      is_mqa=is_mqa,
-      save_residuals=save_residuals,
-      mask_value=mask_value,
-      is_segmented=segment_ids is not None,
-      attn_logits_soft_cap=attn_logits_soft_cap,
-      residual_checkpoint_name=residual_checkpoint_name,
-  )
-
-  mosaic_params = pltpu.encode_kernel_regeneration_metadata(metadata_dict)
-
-  mosaic_params.update(
+  mosaic_params = dict(
       dimension_semantics=("parallel", "arbitrary", "arbitrary"),
       flags={"XLA_TPU_FORCE_LP_LLO_SCHEDULER": True},
   )
 
   kernel_name = get_kernel_name(
-      is_mqa, save_residuals, segment_ids is not None, "fwd"
+      dataclasses.asdict(block_sizes),
+      is_mqa=is_mqa,
+      save_residuals=save_residuals,
+      is_segmented=segment_ids is not None,
+      phase="fwd",
   )
 
   if fwd_mask_info.data_next is not None:
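
With the block metadata folded into the kernel name, it is the existing jax.named_scope(kernel_name) wrapper around pl.pallas_call that surfaces this information in HLO dumps and profiler traces (the purpose described by the removed comments). A minimal standalone sketch of that mechanism; the function and scope string below are made up for illustration:

    import jax
    import jax.numpy as jnp

    @jax.jit
    def f(x):
      # Ops traced inside the scope carry its name, which is how the kernel
      # name (and the metadata embedded in it) shows up in HLO dumps and
      # profiler traces.
      with jax.named_scope("splash_mha_fwd_block_kv=1024_block_q=512"):
        return jnp.sin(x)

    f(jnp.ones((4,)))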
@@ -1526,28 +1521,24 @@ def logsumexp_index_map(h, i, *_):
   )
   num_scalar_prefetch = 3
 
-  # Attach useful metadata to the custom-call HLO op.
-  # Having this information available in an HLO-dump or xprof is valuable for
-  # debugging and performance investigation.
-  metadata_dict = dict(
-      block_q_dq=bq,
-      block_kv_dq=bkv,
-      q_layout=q_layout,
-      k_layout=k_layout,
-      v_layout=v_layout,
-      is_mqa=is_mqa,
-      mask_value=mask_value,
-      is_segmented=segment_ids is not None,
-      attn_logits_soft_cap=attn_logits_soft_cap,
-  )
-
-  mosaic_params = pltpu.encode_kernel_regeneration_metadata(metadata_dict)
-  mosaic_params.update(
+  mosaic_params = dict(
       dimension_semantics=("arbitrary", "arbitrary", "arbitrary"),
       flags={"XLA_TPU_FORCE_LP_LLO_SCHEDULER": True},
   )
 
-  kernel_name = get_kernel_name(is_mqa, False, segment_ids is not None, "dq")
+  kernel_name = get_kernel_name(
+      dict(
+          block_q_dq=bq,
+          block_kv_dq=bkv,
+          q_layout=q_layout,
+          k_layout=k_layout,
+          v_layout=v_layout,
+      ),
+      is_mqa=is_mqa,
+      save_residuals=False,
+      is_segmented=segment_ids is not None,
+      phase="dq",
+  )
   with jax.named_scope(kernel_name):
     _, dq = pl.pallas_call(
         kernel,
@@ -2072,35 +2063,30 @@ def logsumexp_index_map(
   )
   num_scalar_prefetch = 3
 
-  # Attach useful metadata to the custom-call HLO op.
-  # Having this information available in an HLO-dump or xprof is valuable for
-  # debugging and performance investigation.
-  metadata_dict = dict(
-      block_q_dkv=bq,
-      block_kv_dkv=bkv,
-      block_kv_dkv_compute=bkv_compute,
-      q_layout=q_layout,
-      k_layout=k_layout,
-      v_layout=v_layout,
-      use_fused_bwd_kernel=use_fused_bwd_kernel,
-      is_mqa=is_mqa,
-      mask_value=mask_value,
-      is_segmented=segment_ids is not None,
-      attn_logits_soft_cap=attn_logits_soft_cap,
-  )
-
-  mosaic_params = pltpu.encode_kernel_regeneration_metadata(metadata_dict)
   # We set all dimensions to arbitrary because:
   # 1) for kv_seq_len, the splash attention prefetch schedule assumes no
   #    megacore
   # 2) for heads, we are reducing over heads
   # 3) for q_seq_len, we are reducing over it to compute dkv
-  mosaic_params.update(
+  mosaic_params = dict(
       dimension_semantics=("arbitrary", "arbitrary", "arbitrary"),
      flags={"XLA_TPU_FORCE_LP_LLO_SCHEDULER": True},
  )
 
-  kernel_name = get_kernel_name(is_mqa, False, segment_ids is not None, "dkv")
+  kernel_name = get_kernel_name(
+      dict(
+          block_q_dkv=bq,
+          block_kv_dkv=bkv,
+          block_kv_dkv_compute=bkv_compute,
+          q_layout=q_layout,
+          k_layout=k_layout,
+          v_layout=v_layout,
+      ),
+      is_mqa=is_mqa,
+      save_residuals=False,
+      is_segmented=segment_ids is not None,
+      phase="dkv",
+  )
   with jax.named_scope(kernel_name):
     _, _, _, dq_unreduced, dk, dv = pl.pallas_call(
         kernel,