[Mosaic GPU] Add support for all sensible conversions involving f8

apaszke · Google-ML-Automation · commit 09e023e4a71e · 2026-01-23T06:18:09.000-08:00
PiperOrigin-RevId: 860078929
diff --git a/jax/experimental/mosaic/gpu/fragmented_array.py b/jax/experimental/mosaic/gpu/fragmented_array.py
@@ -2070,9 +2070,11 @@ def astype(
     i8 = ir.IntegerType.get_signless(8)
     i16 = ir.IntegerType.get_signless(16)
     i32 = ir.IntegerType.get_signless(32)
-    bf16 = ir.BF16Type.get()
     f32 = ir.F32Type.get()
+    f16 = ir.F16Type.get()
+    bf16 = ir.BF16Type.get()
     f8e4m3fn = ir.Float8E4M3FNType.get()
+    f8e5m2 = ir.Float8E5M2Type.get()
     f8e8m0fnu = ir.Float8E8M0FNUType.get()
 
     cur_dtype = self.mlir_dtype
@@ -2430,23 +2432,21 @@ def pairwise_convert(do_convert):
           _registers=new_registers, _layout=self.layout, _is_signed=is_signed
       )
 
+    # Here we handle all conversions involving f8 types.
+    # TODO(apaszke): Figure out proper satfinite and rounding modes.
+    supported_f8_f16 = {f8e4m3fn: f16, f8e5m2: f16, f8e8m0fnu: bf16}
+    f8_ptx_names = {f8e4m3fn: "e4m3", f8e5m2: "e5m2", f8e8m0fnu: "ue8m0"}
+    f16_ptx_names = {f16: "f16", bf16: "bf16"}
+    f8_types = f8_ptx_names.keys()
+    f16_types = f16_ptx_names.keys()
     if f8e8m0fnu in {cur_dtype, new_dtype} and utils.get_arch().major < 10:
       raise ValueError(
           "f8e8m0fnu type only supported on Blackwell and newer GPUs"
       )
-    if cur_dtype == f8e8m0fnu and new_dtype == bf16:
-      def do_convert(pair_vec):
-        return llvm.inline_asm(
-            i32,
-            [utils.bitcast(pair_vec, i16)],
-            "cvt.rn.bf16x2.ue8m0x2 $0, $1;",
-            "=r,h",
-        )
-      return pairwise_convert(do_convert)
-    # TODO(bchetioui): handle conversions to/from other float8 types.
-    if cur_dtype == f32 and new_dtype in {f8e4m3fn, f8e8m0fnu}:
-      tgt_ty = "e4m3" if new_dtype == f8e4m3fn else "ue8m0"
-      rounding = "rn" if new_dtype == f8e4m3fn else "rz"
+    # f8 <-> f32
+    if cur_dtype == f32 and new_dtype in f8_types:
+      name_8 = f8_ptx_names[new_dtype]
+      rounding = "rz" if new_dtype == f8e8m0fnu else "rn"
       def do_convert(pair_vec):
         e0, e1 = (
             vector.extract(pair_vec, dynamic_position=[], static_position=[i])
@@ -2455,16 +2455,51 @@ def do_convert(pair_vec):
         return llvm.inline_asm(
             i16,
             [e1, e0],
-            f"cvt.{rounding}.satfinite.{tgt_ty}x2.f32 $0, $1, $2;",
+            f"cvt.{rounding}.satfinite.{name_8}x2.f32 $0, $1, $2;",
             "=h,r,r",
         )
       return pairwise_convert(do_convert)
-
-    if cur_dtype == f8e8m0fnu and new_dtype == f32:
-      return self.astype(bf16).astype(f32)
-    if cur_dtype == bf16 and new_dtype == f8e4m3fn:
-      # There are no instructions to convert bf16 to f8e4m3fn directly.
-      return self.astype(f32).astype(f8e4m3fn)
+    # No f8 type supports direct conversion to f32, so we go via 16-bit floats.
+    if cur_dtype in f8_types and new_dtype == f32:
+      return self.astype(supported_f8_f16[cur_dtype]).astype(f32)
+    # f8 <-> f16
+    if new_dtype in f8_types and cur_dtype == supported_f8_f16[new_dtype]:
+      name_16 = f16_ptx_names[cur_dtype]
+      name_8 = f8_ptx_names[new_dtype]
+      rounding = "rz" if new_dtype == f8e8m0fnu else "rn"
+      ptx = f"cvt.{rounding}.satfinite.{name_8}x2.{name_16}x2 $0, $1;"
+      def do_convert(pair_vec):
+        return llvm.inline_asm(i16, [utils.bitcast(pair_vec, i32)], ptx, "=h,r")
+      return pairwise_convert(do_convert)
+    if cur_dtype in f8_types and new_dtype == supported_f8_f16[cur_dtype]:
+      name_8 = f8_ptx_names[cur_dtype]
+      name_16 = f16_ptx_names[new_dtype]
+      ptx = f"cvt.rn.{name_16}x2.{name_8}x2 $0, $1;"
+      def do_convert(pair_vec):
+        return llvm.inline_asm(i32, [utils.bitcast(pair_vec, i16)], ptx, "=r,h")
+      return pairwise_convert(do_convert)
+    # We don't emulate the unsupported f8 <-> f16 conversions, but rather force
+    # the user to go via f32 to let them know it's expensive.
+    if (new_dtype in f8_types and cur_dtype in f16_types) or (
+        new_dtype in f16_types and cur_dtype in f8_types
+    ):
+      # Remap the 16-bit type to the supported one.
+      ok_cur_dtype = supported_f8_f16.get(new_dtype, cur_dtype)
+      ok_new_dtype = supported_f8_f16.get(cur_dtype, new_dtype)
+      raise NotImplementedError(
+          f"Hardware has no support for converting from {cur_dtype} to"
+          f" {new_dtype} (only cast from {ok_cur_dtype} to {ok_new_dtype} is"
+          " supported). Cast to f32 first and then to the target type"
+          " (expensive, but sufficient)."
+      )
+    # f8 <-> f8: go via the supported 16-bit type when both f8 types share it.
+    if cur_dtype in f8_types and new_dtype in f8_types:
+      if supported_f8_f16[cur_dtype] == supported_f8_f16[new_dtype]:
+        return self.astype(supported_f8_f16[cur_dtype]).astype(new_dtype)
+      raise NotImplementedError(
+          f"Conversion from {cur_dtype} to {new_dtype} must go through f32,"
+          " which is expensive. Cast to f32 explicitly if you really want it."
+      )
 
     # Generic path.
     from_float = isinstance(cur_dtype, ir.FloatType)
diff --git a/tests/mosaic/gpu_test.py b/tests/mosaic/gpu_test.py
@@ -662,56 +662,6 @@ def kernel(ctx, inp, out, smem):
     f = mgpu.as_gpu_kernel(kernel, (1, 1, 1), (128, 1, 1), x, y, (x, y))
     np.testing.assert_array_equal(f(x), y)
 
-  @parameterized.parameters(
-      (jnp.float32, jnp.float8_e4m3fn),
-      (jnp.bfloat16, jnp.float8_e4m3fn)
-  )
-  def test_f8_conversions(self, jax_dtype_from, jax_dtype_to):
-    mlir_dtype_to = utils.dtype_to_ir_type(jax_dtype_to)
-    def kernel(ctx, inp, out, smem):
-      del ctx
-      smem_from, smem_to = smem
-      copy(inp, smem_from, swizzle=128)
-      t = mgpu.FragmentedArray.load_tiled(
-          smem_from,
-          swizzle=128,
-          is_signed=None,
-          layout=fa.WGMMA_LAYOUT,
-      )
-      t = t.astype(mlir_dtype_to, is_signed=utils.is_signed(jax_dtype_to))
-      t.store_tiled(smem_to, swizzle=128)
-      copy(smem_to, out, swizzle=128)
-
-    # These generative shenanigans are to ensure that we don't generate values
-    # that are too large for the target type. That is because the saturation
-    # behavior of the conversion is different between XLA and Mosaic GPU here
-    # (to use the NVIDIA internal, we allow Mosaic GPU to use the .satfinite
-    # modifier, which saturates to the largest finite value---while XLA would
-    # give us NaNs in this case).
-    max_finite_val = 0b111_1110
-
-    expected = jax.lax.bitcast_convert_type(
-        jax.random.randint(
-            jax.random.key(42),
-            (1, 1, 64, 128),
-            -max_finite_val,
-            max_finite_val + 1,
-            dtype=jnp.uint8,
-        ),
-        jax_dtype_to,
-    )
-    x = expected.astype(jax_dtype_from)
-
-    res = mgpu.as_gpu_kernel(
-        kernel,
-        (1, 1, 1),
-        (128, 1, 1),
-        x,
-        expected,
-        (x, expected),
-    )(x)
-    np.testing.assert_array_equal(res, expected)
-
   @parameterized.product(
       jax_dtype_from_to=(
           (jnp.int8, jnp.bfloat16),
@@ -3473,30 +3423,33 @@ def kernel(ctx, dst, _):
     np.testing.assert_array_equal(result, op(iota, rhs).astype(jnp.int8))
 
   @parameterized.product(
-      # TODO(apaszke): Add float16, float8_e5m2
-      jax_dtype_from=(jnp.float32, jnp.bfloat16, jnp.float8_e4m3fn, jnp.float8_e8m0fnu),
-      jax_dtype_to=(jnp.float32, jnp.bfloat16, jnp.float8_e4m3fn, jnp.float8_e8m0fnu),
-      # Test different vector lengths.
+      # TODO(apaszke): Add float16
+      jax_dtype_from=(jnp.float32, jnp.bfloat16, jnp.float8_e5m2, jnp.float8_e4m3fn, jnp.float8_e8m0fnu),
+      jax_dtype_to=(jnp.float32, jnp.bfloat16, jnp.float8_e5m2, jnp.float8_e4m3fn, jnp.float8_e8m0fnu),
       vec_len=(1, 2, 4, 8),
   )
   def test_conversion_f8_(self, jax_dtype_from, jax_dtype_to, vec_len):
     from_bitwidth = jnp.finfo(jax_dtype_from).bits
     to_bitwidth = jnp.finfo(jax_dtype_to).bits
     if from_bitwidth > 8 and to_bitwidth > 8:
       self.skipTest("At least one of the types should be 8-bit")
-    if from_bitwidth == to_bitwidth == 8:
-      self.skipTest("f8 <-> f8 conversions unimplemented")
+    if jax_dtype_from == jax_dtype_to:
+      self.skipTest("Identical types, so nothing to test")
     if jnp.float8_e8m0fnu in {
         jax_dtype_from,
         jax_dtype_to,
     } and not jtu.is_cuda_compute_capability_at_least("10.0"):
       self.skipTest("f8e8m0fnu not supported on pre-Blackwell GPUs")
-    unimplemented = [
-        (jnp.float8_e4m3fn, jnp.bfloat16),
-        (jnp.float8_e4m3fn, jnp.float32),
-        (jnp.bfloat16, jnp.float8_e8m0fnu),
-    ]
-    if (jax_dtype_from, jax_dtype_to) in unimplemented:
+    if from_bitwidth == to_bitwidth == 8 and {jax_dtype_from, jax_dtype_to} != {
+        jnp.float8_e4m3fn, jnp.float8_e5m2,
+    }:
+      self.skipTest("An unimplemented f8 <-> f8 conversion")
+    unimplemented = {
+        frozenset((jnp.float8_e4m3fn, jnp.bfloat16)),
+        frozenset((jnp.float8_e5m2, jnp.bfloat16)),
+        frozenset((jnp.float8_e8m0fnu, jnp.float16)),
+    }
+    if {jax_dtype_from, jax_dtype_to} in unimplemented:
       self.skipTest("Unimplemented")
     layout = fa.tmem_native_layout(vec_len)
     mlir_dtype_to = utils.dtype_to_ir_type(jax_dtype_to)
@@ -3516,7 +3469,10 @@ def kernel(ctx, inp, out, smem):
     bits = self.prng.integers(
         low=sample_iinfo.min, high=sample_iinfo.max, size=(m, n), dtype=np.int32
     ).astype(int_sample_dtype)
-    values = jax.lax.bitcast_convert_type(bits, narrow_type).astype(jax_dtype_from)
+    values = jax.lax.bitcast_convert_type(bits, narrow_type)
+    # A bunch of conversions are only supported for finite values.
+    values = values.at[jnp.isinf(values)].set(jnp.finfo(narrow_type).max)
+    values = values.astype(jax_dtype_from)
 
     expected = values.astype(jax_dtype_to)
     res = mgpu.as_gpu_kernel(
@@ -3860,44 +3816,44 @@ def kernel(ctx, dst, _):
   )
   @jtu.thread_unsafe_test()
   def test_max(self, vec_size, dtype):
-      def kernel(ctx, src, src2, dst, _):
-        is_signed = utils.is_signed(dtype)
-        src = fa.FragmentedArray.load_strided(src, vec_size=vec_size, is_signed=is_signed)
-        src2 = fa.FragmentedArray.load_strided(src2, vec_size=vec_size, is_signed=is_signed)
-        src.max(src2).store_untiled(dst)
-      x = self.prng.uniform(-1, 1, (12 * 128,)).astype(dtype)
-      y = self.prng.uniform(-1, 1, (12 * 128,)).astype(dtype)
-      f = mgpu.as_gpu_kernel(
+    def kernel(ctx, src, src2, dst, _):
+      is_signed = utils.is_signed(dtype)
+      src = fa.FragmentedArray.load_strided(src, vec_size=vec_size, is_signed=is_signed)
+      src2 = fa.FragmentedArray.load_strided(src2, vec_size=vec_size, is_signed=is_signed)
+      src.max(src2).store_untiled(dst)
+    x = self.prng.uniform(-1, 1, (12 * 128,)).astype(dtype)
+    y = self.prng.uniform(-1, 1, (12 * 128,)).astype(dtype)
+    f = mgpu.as_gpu_kernel(
           kernel, (1, 1, 1), (128, 1, 1), (x, y), x, ()
       )
-      with jtu.set_env(MOSAIC_GPU_DUMP_PTX="1"), self.capture_stdout() as ptx:
-        z = f(x, y).block_until_ready()
-      if dtype == jnp.float32:
-        dtype_short = "f32"
-      elif dtype == jnp.float16:
-        dtype_short = "f16"
-      elif dtype == jnp.bfloat16:
-        dtype_short = "bf16"
-      elif jnp.issubdtype(dtype, jnp.signedinteger):
-        dtype_short = f"s{dtypes.itemsize_bits(dtype)}"
-      elif jnp.issubdtype(dtype, jnp.unsignedinteger):
-        dtype_short = f"u{dtypes.itemsize_bits(dtype)}"
-      else:
-        raise NotImplementedError(f"Unsupported dtype: {dtype}")
-      ptx = ptx()
-      nan_modifier = ".NaN" if jnp.issubdtype(dtype, jnp.floating) else ""
-      instr = f"max{nan_modifier}.{dtype_short} "
-      instr_double = f"max{nan_modifier}.{dtype_short}x2 "
-      single_converts = ptx.count(instr)
-      double_converts = ptx.count(instr_double)
-      self.assertEqual(128 * (single_converts + 2 * double_converts), 12 * 128)
-      if vec_size % 2:
-        self.assertGreater(single_converts, 0)
-      elif dtypes.itemsize_bits(dtype) < 32:
-        # This, together with the assertion above, implies that all converts
-        # happened through doubled operations.
-        self.assertEqual(single_converts, 0)
-      np.testing.assert_array_equal(z, np.maximum(x, y))
+    with jtu.set_env(MOSAIC_GPU_DUMP_PTX="1"), self.capture_stdout() as ptx:
+      z = f(x, y).block_until_ready()
+    if dtype == jnp.float32:
+      dtype_short = "f32"
+    elif dtype == jnp.float16:
+      dtype_short = "f16"
+    elif dtype == jnp.bfloat16:
+      dtype_short = "bf16"
+    elif jnp.issubdtype(dtype, jnp.signedinteger):
+      dtype_short = f"s{dtypes.itemsize_bits(dtype)}"
+    elif jnp.issubdtype(dtype, jnp.unsignedinteger):
+      dtype_short = f"u{dtypes.itemsize_bits(dtype)}"
+    else:
+      raise NotImplementedError(f"Unsupported dtype: {dtype}")
+    ptx = ptx()
+    nan_modifier = ".NaN" if jnp.issubdtype(dtype, jnp.floating) else ""
+    instr = f"max{nan_modifier}.{dtype_short} "
+    instr_double = f"max{nan_modifier}.{dtype_short}x2 "
+    single_converts = ptx.count(instr)
+    double_converts = ptx.count(instr_double)
+    self.assertEqual(128 * (single_converts + 2 * double_converts), 12 * 128)
+    if vec_size % 2:
+      self.assertGreater(single_converts, 0)
+    elif dtypes.itemsize_bits(dtype) < 32:
+      # This, together with the assertion above, implies that all converts
+      # happened through doubled operations.
+      self.assertEqual(single_converts, 0)
+    np.testing.assert_array_equal(z, np.maximum(x, y))
 
   def test_splat_layout(self):
     m, n = 64, 8