Skip to content

Commit c3f1b70

Browse files
author
Alexander
committed
added fp8 code and tests
1 parent 4ea6711 commit c3f1b70

File tree

4 files changed

+111
-72
lines changed

4 files changed

+111
-72
lines changed

mpx/_dtypes.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,11 @@ def set_forward_backward_precision(forward_datatype, backward_datatype):
4242
global EXPERIMENTAL_ACTIVATED
4343
global FORWARD_PRECISION_DATATYPE
4444
global BACKWARD_PRECISION_DATATYPE
45-
logging.warning("Setting forward precision is an experimental feature and may lead to unexpected behavior.")
45+
logging.warning("Setting forward backward precision is an experimental feature and may lead to unexpected behavior.")
4646
EXPERIMENTAL_ACTIVATED = True
4747
FORWARD_PRECISION_DATATYPE = forward_datatype
4848
BACKWARD_PRECISION_DATATYPE = backward_datatype
49+
assert backward_datatype == jnp.float32, "Currently only float32 is supported as backward datatype."
4950

5051
def forward_datatype():
5152
assert EXPERIMENTAL_ACTIVATED, "Experimental features not activated. Call set_forward_backward_precision first."

mpx/experimental/__init__.py

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,6 @@
1-
"""
2-
Mixed Precision for JAX - A library for mixed precision training in JAX
3-
"""
4-
5-
__version__ = "0.1.7"
6-
7-
from ._cast import (
8-
cast_tree,
9-
cast_to_float32,
10-
cast_to_float16,
11-
cast_to_bfloat16,
12-
cast_to_full_precision,
13-
cast_to_half_precision,
14-
force_full_precision,
15-
cast_function,
16-
)
17-
from ._dtypes import half_precision_datatype, set_half_precision_datatype, HALF_PRECISION_DATATYPE, set_forward_backward_precision, forward_datatype, backward_datatype # , FLOAT16_MAX, BFLOAT16_MAX
18-
from ._loss_scaling import DynamicLossScaling, all_finite, scaled
19-
from ._grad_tools import select_tree, filter_grad, filter_value_and_grad, optimizer_update, calculate_scaled_grad
20-
1+
from ._cast import cast_function_fwd_bwd
212

223
__all__ = [
234
# Cast functions
24-
'cast_tree',
25-
'cast_to_float32',
26-
'cast_to_float16',
27-
'cast_to_bfloat16',
28-
'cast_to_full_precision',
29-
'cast_to_half_precision',
30-
'force_full_precision',
31-
'cast_function',
5+
'cast_function_fwd_bwd',
326
]

mpx/experimental/_cast.py

Lines changed: 27 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -13,43 +13,44 @@
1313
from jaxtyping import Array, Float, Int, PyTree, PRNGKeyArray, ArrayLike
1414

1515
from .._dtypes import forward_datatype, backward_datatype
16-
from .._cast import cast_tree
16+
from .._cast import cast_function
1717

1818

1919
def max_val(dtype):
2020
return (jnp.finfo(dtype).max).astype(jnp.float32)
2121

22-
@partial(jax.custom_vjp, nondiff_argnames=("dtype8", 'dimension_numbers', 'precision', 'preferred_element_type', 'out_sharding'))
23-
def quantized_multiplication(a: ArrayLike, b: ArrayLike, dtype8, dimension_numbers, precision, preferred_element_type, out_sharding):
22+
@partial(jax.custom_vjp, nondiff_argnames=('dimension_numbers', 'precision', 'preferred_element_type', 'out_sharding'))
23+
def quantized_multiplication(a: ArrayLike, b: ArrayLike, dimension_numbers, precision, preferred_element_type, out_sharding):
2424
a_max = jnp.max(jnp.abs(a))
2525
b_max = jnp.max(jnp.abs(b))
26-
max_dtype = max_val(dtype8)
26+
fwd_dtype = forward_datatype()
27+
max_dtype = max_val(fwd_dtype)
2728
scaling_a = max_dtype / (a_max + 1e-8)
2829
scaling_b = max_dtype / (b_max + 1e-8)
2930

30-
a_q = (a * scaling_a).astype(dtype8)
31-
b_q = (b * scaling_b).astype(dtype8)
31+
a_q = (a * scaling_a).astype(fwd_dtype)
32+
b_q = (b * scaling_b).astype(fwd_dtype)
3233

3334
result_q = jax.lax.dot_general_p.bind(a_q, b_q, dimension_numbers=dimension_numbers, precision=precision, preferred_element_type=preferred_element_type, out_sharding=out_sharding)
34-
35-
result = (result_q.astype(jnp.float32)) / (scaling_a * scaling_b)
35+
result = (result_q.astype(backward_datatype())) / (scaling_a * scaling_b)
3636
return result
3737

3838

39-
def quantized_multiplication_fwd(a: ArrayLike, b: ArrayLike, dtype8, dimension_numbers, precision, preferred_element_type, out_sharding):
39+
def quantized_multiplication_fwd(a: ArrayLike, b: ArrayLike, dimension_numbers, precision, preferred_element_type, out_sharding):
4040
a_max = jnp.max(jnp.abs(a))
4141
b_max = jnp.max(jnp.abs(b))
42-
max_dtype = max_val(dtype8)
42+
fwd_dtype = forward_datatype()
43+
max_dtype = max_val(fwd_dtype)
4344
scaling_a = max_dtype / (a_max + 1e-8)
4445
scaling_b = max_dtype / (b_max + 1e-8)
4546

46-
a_q = (a * scaling_a).astype(dtype8)
47-
b_q = (b * scaling_b).astype(dtype8)
47+
a_q = (a * scaling_a).astype(fwd_dtype)
48+
b_q = (b * scaling_b).astype(fwd_dtype)
4849
# we want to save the quantized versions for the backward pass to save memory
49-
return quantized_multiplication(a, b, dtype8, dimension_numbers, precision, preferred_element_type, out_sharding), (a_q, b_q, scaling_a, scaling_b)
50+
return quantized_multiplication(a, b, dimension_numbers, precision, preferred_element_type, out_sharding), (a_q, b_q, scaling_a, scaling_b)
5051

5152
# f_bwd :: (c, CT b) -> CT a
52-
def quantized_multiplication_bwd(dtype8, dimension_numbers, precision, preferred_element_type, out_sharding, c, dy_dc):
53+
def quantized_multiplication_bwd(dimension_numbers, precision, preferred_element_type, out_sharding, c, dy_dc):
5354
a_q, b_q, scaling_a, scaling_b = c
5455
backward_dtype = backward_datatype()
5556
    # backward is performed in fp32. TODO: allow changing this.
@@ -62,41 +63,24 @@ def quantized_multiplication_bwd(dtype8, dimension_numbers, precision, preferred
6263

6364
quantized_multiplication.defvjp(quantized_multiplication_fwd, quantized_multiplication_bwd)
6465

66+
6567
@quax.register(jax.lax.dot_general_p)
6668
def _(lhs: ArrayLike, rhs: ArrayLike, **params):
67-
return quantized_multiplication(lhs, rhs, jnp.float8_e4m3, **params)
69+
return quantized_multiplication(lhs, rhs, **params)
6870

6971

70-
71-
def cast_function(func, dtype, return_dtype=None):
72+
def cast_function_fwd_bwd(f: callable) -> callable:
7273
"""
73-
Casts the function to the specified data type.
74+
Casts a function to use the specified forward and backward data types.
75+
Args:
76+
f (callable): The function to be cast.
77+
Returns:
78+
callable: A new function that uses the specified data types for forward and backward passes.
7479
"""
7580

76-
if return_dtype is None:
77-
return_dtype = dtype
78-
79-
def wrapper(*args, **kwargs):
80-
args_cast = []
81-
for arg in args:
82-
args_cast.append(cast_tree(arg, dtype))
83-
args_cast = tuple(args_cast)
84-
85-
kwargs_cast = {}
86-
for key, value in kwargs.items():
87-
kwargs_cast[key] = cast_tree(value, dtype)
88-
89-
results = func(*args_cast, **kwargs_cast)
90-
91-
if type(results) == tuple:
92-
results_converted = []
93-
for r in results:
94-
results_converted.append(cast_tree(r, return_dtype))
95-
return tuple(results_converted)
96-
elif eqx.is_array(results):
97-
return cast_tree(results, return_dtype)
98-
return results
99-
100-
return wrapper
81+
    # cast inputs to bwd_dtype. This ensures all non-multiply operations run in bwd_dtype
82+
f = cast_function(f, backward_datatype())
10183

84+
f = quax.quaxify(f)
10285

86+
return f

tests/test_fp8.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import unittest
2+
import jax
3+
import jax.numpy as jnp
4+
import equinox as eqx
5+
from jaxtyping import Array, Float, Int, PyTree
6+
import numpy as np
7+
8+
from mpx import set_forward_backward_precision
9+
10+
from mpx.experimental import cast_function_fwd_bwd
11+
12+
13+
class MLP(eqx.Module):
14+
a: Array
15+
b: Array
16+
17+
def __init__(self):
18+
self.a = jnp.ones((10, 10), dtype=jnp.float32)
19+
self.b = jnp.ones(10, dtype=jnp.float32)
20+
21+
def __call__(self, x):
22+
return jax.nn.relu(self.a @ x + self.b)
23+
24+
25+
class TestFP8(unittest.TestCase):
26+
def setUp(self):
27+
# Create some test data
28+
self.array_float32 = jnp.array([1.0, 2.0, 3.0], dtype=jnp.float32)
29+
30+
def test_cast_function_fwd_bwd(self):
31+
# Create test module
32+
module = MLP()
33+
for bwd_dtype in [jnp.float32]:
34+
set_forward_backward_precision(jnp.float8_e5m2, bwd_dtype)
35+
36+
def loss_fn(mdl, inp):
37+
out = mdl(inp)
38+
return jnp.sum(out)
39+
loss_fn_fp8 = cast_function_fwd_bwd(loss_fn)
40+
41+
x = jnp.ones((10,1), dtype=jnp.float32)
42+
43+
# test forward pass
44+
    # the output should be in the backward datatype, since only multiplications are cast to the forward datatype
45+
output = loss_fn_fp8(module, x)
46+
output_original = loss_fn(module, x)
47+
print(output)
48+
print(output_original)
49+
self.assertTrue(np.allclose(output, output_original, atol=1e-4))
50+
self.assertEqual(output.dtype, bwd_dtype)
51+
52+
# test backward pass
53+
grad_fn_fp8 = jax.grad(loss_fn_fp8)
54+
grad_fn = jax.grad(loss_fn)
55+
grads_fp8 = grad_fn_fp8(module, x)
56+
grads = grad_fn(module, x)
57+
58+
# as MLP and x all have the same values, the gradients should be the same
59+
# (for other values, the gradients will differ slightly due to quantization errors)
60+
self.assertTrue(np.allclose(grads_fp8.a, grads.a, atol=1e-4))
61+
self.assertTrue(np.allclose(grads_fp8.b, grads.b, atol=1e-4))
62+
63+
self.assertEqual(grads_fp8.a.dtype, bwd_dtype)
64+
self.assertEqual(grads_fp8.b.dtype, bwd_dtype)
65+
66+
# test now with values where quantization errors are larger
67+
x = jnp.arange(10, dtype=bwd_dtype).reshape((10,1)) + 1.0
68+
output = loss_fn_fp8(module, x)
69+
output_original = loss_fn(module, x)
70+
grads_fp8 = grad_fn_fp8(module, x)
71+
grads = grad_fn(module, x)
72+
73+
self.assertFalse(np.allclose(output, output_original, atol=1e-4))
74+
self.assertFalse(np.allclose(grads_fp8.a, grads.a, atol=1e-4))
75+
# bias is in fp32, so it should be close
76+
self.assertTrue(np.allclose(grads_fp8.b, grads.b, atol=1e-4))
77+
78+
79+
if __name__ == '__main__':
80+
unittest.main()

0 commit comments

Comments
 (0)