
Commit 2ade7e7

gnecula authored and jax authors committed
[pallas] Move the hardware_generation query in the code path that needs it
This change allows us to lower and export Pallas calls even on machines that do not have TPUs, in many cases.

PiperOrigin-RevId: 641841079
1 parent af95803 commit 2ade7e7
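As a hedged sketch of the workflow this change enables (mirroring the test added below; add_one_kernel and add_one are illustrative names, not part of the commit):

import jax
import numpy as np
from jax.experimental import export
from jax.experimental import pallas as pl

def add_one_kernel(x_ref, o_ref):
  # A trivial Pallas kernel: read the block, add one, write it back.
  o_ref[...] = x_ref[...] + 1

@jax.jit
def add_one(x: jax.Array) -> jax.Array:
  return pl.pallas_call(
      add_one_kernel,
      out_shape=jax.ShapeDtypeStruct(x.shape, x.dtype))(x)

# With this change, lowering for TPU succeeds even on a host with no TPUs
# (provided the Mosaic Python pipeline is not forced on via flags).
exp = export.export(add_one, lowering_platforms=["tpu"])(np.arange(8))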

File tree

3 files changed: +94 -5 lines changed


jax/_src/tpu_custom_call.py

Lines changed: 9 additions & 5 deletions
@@ -384,16 +384,12 @@ def as_tpu_kernel(
 ) -> Callable[..., Any]:
   """Turns an MLIR Mosaic kernel into a JAX-compatible function."""
   # We use jax.jit to make sure we hit the fast compilation cache.
-  some_tpu = jax.devices(backend)[0]
-  device_kind = some_tpu.device_kind
-  if not device_kind.startswith("TPU v"):
-    raise ValueError(f"Unrecognized TPU device kind: {device_kind}.")
+
   if vmem_limit_bytes is not None and not isinstance(vmem_limit_bytes, int):
     raise ValueError(
         "vmem_limit_bytes must be an int: provided with a"
         f" {type(vmem_limit_bytes)}."
     )
-  hardware_generation = int(device_kind[len("TPU v")])
   has_communication, has_custom_barrier = tpu.private_has_communication(
       module.operation
   )
@@ -405,6 +401,14 @@ def as_tpu_kernel(
       module.operation.get_asm(binary=True, enable_debug_info=True)
   )
   if needs_layout_passes and _MOSAIC_USE_PYTHON_PIPELINE.value:
+    some_tpu = jax.devices(backend)[0]
+    device_kind = some_tpu.device_kind
+    if not device_kind.startswith("TPU v"):
+      raise ValueError(
+          f"Unrecognized TPU device kind: {device_kind}. "
+          "tpu_custom_call cannot be lowered on a machine without TPUs "
+          "when mosaic_use_python_pipeline=True.")
+    hardware_generation = int(device_kind[len("TPU v")])
     module = _lower_tpu_kernel(module, hardware_generation)
     needs_hlo_passes = False
     needs_layout_passes = False
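
What moved is a small parsing step. As a standalone sketch (plain Python, not part of the diff), the deferred query boils down to:

def parse_hardware_generation(device_kind: str) -> int:
  # device_kind comes from a TPU device and looks like "TPU v4"; the
  # single digit after the "TPU v" prefix is the hardware generation.
  if not device_kind.startswith("TPU v"):
    raise ValueError(f"Unrecognized TPU device kind: {device_kind}.")
  return int(device_kind[len("TPU v")])

assert parse_hardware_generation("TPU v4") == 4

Since jax.devices(backend) fails on a machine with no TPUs, running this query unconditionally made every lowering fail there; after the move it runs only when the Mosaic Python pipeline actually needs the generation number.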

tests/pallas/BUILD

Lines changed: 29 additions & 0 deletions
@@ -330,3 +330,32 @@ jax_test(
         "//jax:pallas_gpu",  # build_cleaner: keep
     ],
 )
+
+jax_test(
+    name = "export_pallas_test",
+    srcs = ["export_pallas_test.py"],
+    config_tags_overrides = {
+        "gpu_a100_x32": {
+            "ondemand": False,  # Include in presubmit.
+        },
+    },
+    disable_configs = [
+        "gpu",
+        "gpu_x32",
+        "gpu_a100",
+        "gpu_h100",
+        "gpu_p100",
+        "gpu_p100_x32",
+        "gpu_pjrt_c_api",
+    ],
+    enable_configs = [
+        "gpu_a100_x32",
+    ],
+    tags = [],
+    deps = [
+        "//jax:pallas",
+        "//jax:pallas_gpu",  # build_cleaner: keep
+        "//jax:pallas_tpu",  # build_cleaner: keep
+        "//jax/experimental/export",
+    ],
+)
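
A note on this configuration: the test is disabled on the generic GPU configs and enabled only on gpu_a100_x32, which is pulled into presubmit via "ondemand": False. It depends on //jax:pallas_gpu and //jax:pallas_tpu as well as //jax/experimental/export, so the TPU lowering path it exercises is linked in regardless of which backend the test actually runs on.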

tests/pallas/export_pallas_test.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+# Copyright 2023 The JAX Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test exporting Pallas kernels."""
+
+from absl.testing import absltest
+import jax
+from jax._src import test_util as jtu
+from jax.experimental import export
+# Import mosaic for flag definitions
+from jax.experimental import mosaic as _  # noqa: F401
+from jax.experimental import pallas as pl
+import numpy as np
+
+
+jax.config.parse_flags_with_absl()
+
+
+class ExportTest(jtu.JaxTestCase):
+
+  def test_cross_platform(self):
+    def add_vectors_kernel(x_ref, y_ref, o_ref):
+      x, y = x_ref[...], y_ref[...]
+      o_ref[...] = x + y
+
+    @jax.jit
+    def add_vectors(x: jax.Array, y: jax.Array) -> jax.Array:
+      return pl.pallas_call(add_vectors_kernel,
+                            out_shape=jax.ShapeDtypeStruct(x.shape, x.dtype)
+                            )(x, y)
+
+    a = np.arange(8)
+    exp = export.export(
+        add_vectors,
+        # TODO(necula): Make this test work on GPU also
+        lowering_platforms=["tpu"],
+    )(a, a)
+
+    if jtu.device_under_test() == "tpu":
+      res = export.call(exp)(a, a)
+      self.assertAllClose(res, a + a)
+
+
+if __name__ == '__main__':
+  absltest.main(testLoader=jtu.JaxTestLoader())
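
On a host without TPUs, the test still exercises lowering and export; only the execution under export.call is guarded by the device check. As a hedged illustration (not part of the commit; Exported.mlir_module() and .lowering_platforms are assumed from the jax.experimental.export API of this era), the exported artifact can be inspected with no TPU present:

# Continues from `exp` above; the attributes used are assumptions, not
# anything the test itself relies on.
print(exp.lowering_platforms)   # expected: ("tpu",)
print(exp.mlir_module()[:80])   # start of the lowered StableHLO text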
