"""
MIT License

Copyright (c) 2025 Alexander Gräfe

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Tools for mixed precision training. Methods and general code architecture are from
jmp (https://github.com/google-deepmind/jmp); this module can be seen as a port and
extension of JMP to Equinox.
"""

"""Filtering tools for mixed precision training."""


from typing import Callable

import jax
import jax.numpy as jnp
import equinox as eqx

import optax

import cast as cast
import loss_scaling as loss_scaling

from jaxtyping import PyTree, Bool


def select_tree(pred: jnp.ndarray, a: PyTree, b: PyTree) -> PyTree:
    """
    Selects elements from one of two pytrees based on a scalar boolean predicate.

    This function traverses two input pytrees (`a` and `b`) and selects elements
    from either `a` or `b` based on the value of the scalar boolean `pred`. If
    `pred` is `True`, elements from `a` are selected; otherwise, elements from `b`
    are selected. Non-array elements in the pytrees are taken directly from `a`.

    Args:
        pred (jnp.ndarray): A scalar boolean array (`jnp.bool_`) that determines
            which pytree to select elements from.
        a (PyTree): The first pytree to select elements from.
        b (PyTree): The second pytree to select elements from.

    Returns:
        PyTree: A new pytree with elements selected from `a` or `b` based on `pred`.

    Raises:
        AssertionError: If `pred` is not a scalar boolean array (`jnp.bool_`).
    """
    assert pred.ndim == 0 and pred.dtype == jnp.bool_, "expected boolean scalar"

    def _select_leaf(x1, x2):
        if eqx.is_array(x1):
            return jax.lax.select(pred, x1, x2)
        else:
            return x1

    return jax.tree_util.tree_map(_select_leaf, a, b)
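
# A minimal usage sketch for `select_tree` (the names `new_params` and
# `old_params` are hypothetical, not part of this module): keep the freshly
# updated pytree only when a scalar flag such as `grads_finite` is True.
#
#   grads_finite = jnp.array(True)
#   params = select_tree(grads_finite, new_params, old_params)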


def filter_grad(func, scaling: loss_scaling.DynamicLossScaling, has_aux=False) -> Callable:
    """
    Computes the gradients of a function using mixed precision and dynamic loss scaling.

    The returned wrapper casts all arguments to half precision, scales the output of
    `func` with the current loss scaling, computes the gradients with
    `eqx.filter_grad`, checks whether all gradients are finite, adjusts the loss
    scaling accordingly, and returns the unscaled gradients.

    Args:
        func (callable): The function to compute gradients for. This function must only use pytrees as parameters!
        scaling (loss_scaling.DynamicLossScaling): The dynamic loss scaling used to scale the loss and unscale the gradients.
        has_aux (bool): If True, the function is expected to return auxiliary values along with the loss.
    Returns:
        callable: A function that computes the filtered gradients of `func`. It returns the new loss scaling, a boolean indicating whether the gradients are finite, and the unscaled gradients (preceded by the aux value if `has_aux` is True).
    """
    def wrapper(*args, **kwargs):
        args_cast = tuple(cast.cast_to_half_precision(x) for x in args)
        kwargs_cast = {k: cast.cast_to_half_precision(v) for k, v in kwargs.items()}

        func_scaled = loss_scaling.scaled(func, scaling)

        dfunc_scaled = eqx.filter_grad(func_scaled, has_aux=has_aux)

        if has_aux:
            # eqx.filter_grad with has_aux=True returns the pair (grad, aux).
            grad, aux = dfunc_scaled(*args_cast, **kwargs_cast)
            grads_finite = loss_scaling.all_finite(grad)
            loss_scaling_new = scaling.adjust(grads_finite)
            # Unscale with the scaling that was actually applied to the loss,
            # not with the freshly adjusted one.
            grad = scaling.unscale(grad)
            return aux, loss_scaling_new, grads_finite, grad
        else:
            grad = dfunc_scaled(*args_cast, **kwargs_cast)
            grads_finite = loss_scaling.all_finite(grad)
            loss_scaling_new = scaling.adjust(grads_finite)
            grad = scaling.unscale(grad)
            return loss_scaling_new, grads_finite, grad

    return wrapper
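
# A usage sketch for `filter_grad`; `loss_fn`, `model`, and `batch` are
# hypothetical placeholders, and constructing the DynamicLossScaling object is
# left to the companion `loss_scaling` module:
#
#   def loss_fn(model, batch):
#       ...  # return a scalar loss
#
#   grad_fn = filter_grad(loss_fn, scaling)
#   scaling, grads_finite, grads = grad_fn(model, batch)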


def filter_value_and_grad(func, scaling: loss_scaling.DynamicLossScaling, has_aux=False) -> Callable:
    """
    Wraps a function to compute its value and gradient with support for mixed precision
    and dynamic loss scaling.

    Args:
        func (Callable): The function for which the value and gradient are to be computed.
        scaling (loss_scaling.DynamicLossScaling): An instance of DynamicLossScaling to
            handle loss scaling and gradient unscaling.
        has_aux (bool, optional): Indicates whether the function `func` returns auxiliary
            outputs along with the main value. Defaults to False.
    Returns:
        Callable: A wrapped function that computes the value, gradient, and additional
        information:
            - If `has_aux` is True:
                ((value, aux), loss_scaling_new, grads_finite, grad)
            - If `has_aux` is False:
                (value, loss_scaling_new, grads_finite, grad)
            Where:
            - `value`: The computed value of the function, unscaled.
            - `aux`: Auxiliary outputs returned by the function (if `has_aux` is True).
            - `loss_scaling_new`: The updated loss scaling object.
            - `grads_finite`: A boolean indicating whether all gradients are finite.
            - `grad`: The computed gradients, unscaled.
    """

    def wrapper(*args, **kwargs):
        args_cast = tuple(cast.cast_to_half_precision(x) for x in args)
        kwargs_cast = {k: cast.cast_to_half_precision(v) for k, v in kwargs.items()}

        func_scaled = loss_scaling.scaled(func, scaling)

        dfunc_scaled = eqx.filter_value_and_grad(func_scaled, has_aux=has_aux)

        if has_aux:
            (value, aux), grad = dfunc_scaled(*args_cast, **kwargs_cast)
            grads_finite = loss_scaling.all_finite(grad)
            loss_scaling_new = scaling.adjust(grads_finite)
            # Unscale value and gradients with the scaling that was applied to the loss.
            grad = scaling.unscale(grad)
            value = scaling.unscale(value)
            return (value, aux), loss_scaling_new, grads_finite, grad
        else:
            value, grad = dfunc_scaled(*args_cast, **kwargs_cast)
            grads_finite = loss_scaling.all_finite(grad)
            loss_scaling_new = scaling.adjust(grads_finite)
            grad = scaling.unscale(grad)
            value = scaling.unscale(value)
            return value, loss_scaling_new, grads_finite, grad

    return wrapper
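
# The value-and-grad variant additionally returns the unscaled loss value
# (again a sketch; `loss_fn`, `model`, and `batch` are placeholders):
#
#   value_and_grad_fn = filter_value_and_grad(loss_fn, scaling, has_aux=False)
#   loss, scaling, grads_finite, grads = value_and_grad_fn(model, batch)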


def optimizer_update(model: PyTree, optimizer: optax.GradientTransformation, optimizer_state: PyTree, grads: PyTree, grads_finite: Bool):
    """
    Applies an optax optimizer step, but only commits the result if all gradients are finite.

    Args:
        model (PyTree): The current model (an Equinox module or parameter pytree).
        optimizer (optax.GradientTransformation): The optax optimizer.
        optimizer_state (PyTree): The current optimizer state.
        grads (PyTree): The (unscaled) gradients for the model parameters.
        grads_finite (Bool): Scalar boolean indicating whether all gradients are finite.

    Returns:
        tuple: The updated `(model, optimizer_state)`; if the gradients are not finite,
        the inputs are returned unchanged.
    """
    # optimizer step
    updates, new_optimizer_state = optimizer.update(
        grads, optimizer_state, eqx.filter(model, eqx.is_array)
    )
    new_model = eqx.apply_updates(model, updates)

    # only apply updates to the model and optimizer state if gradients are finite
    model = select_tree(grads_finite, new_model, model)
    optimizer_state = select_tree(grads_finite, new_optimizer_state, optimizer_state)

    return model, optimizer_state
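
# Putting the pieces together: a sketch of one mixed-precision training step.
# `loss_fn`, `model`, `batch`, and the concrete optax optimizer are hypothetical
# placeholders; only `filter_value_and_grad`, `optimizer_update`, and the
# scaling object come from this module and its companions.
#
#   optimizer = optax.adam(1e-3)
#   opt_state = optimizer.init(eqx.filter(model, eqx.is_array))
#
#   def train_step(model, opt_state, scaling, batch):
#       step_fn = filter_value_and_grad(loss_fn, scaling)
#       loss, scaling, grads_finite, grads = step_fn(model, batch)
#       model, opt_state = optimizer_update(
#           model, optimizer, opt_state, grads, grads_finite
#       )
#       return model, opt_state, scaling, loss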