mlx updates post compilation (#20996)

acsweet · web-flow · commit f337bb40cd90 · 2025-03-06T14:42:40.000-08:00
diff --git a/keras/src/backend/mlx/core.py b/keras/src/backend/mlx/core.py
@@ -114,6 +114,11 @@ def convert_to_tensor(x, dtype=None, sparse=None, ragged=None):
         # load h5py._hl.dataset.Dataset object with numpy
         x = np.array(x)
 
+    if x is None:
+        # Needed for tracking: raise an explicit ValueError here, because
+        # `mx.array(None)` would otherwise fail with a TypeError.
+        raise ValueError("mlx cannot convert `None` to array")
+
     return mx.array(x, dtype=mlx_dtype)
 
 
diff --git a/keras/src/backend/mlx/export.py b/keras/src/backend/mlx/export.py
@@ -1,8 +1,45 @@
+from keras.src import layers
+from keras.src import tree
+
+
 class MlxExportArchive:
+    def __init__(self):
+        self._backend_variables = []
+        self._backend_trainable_variables = []
+        self._backend_non_trainable_variables = []
+
     def track(self, resource):
-        raise NotImplementedError(
-            "`track` is not implemented in the mlx backend."
-        )
+        if not isinstance(resource, layers.Layer):
+            raise ValueError(
+                "Invalid resource type. Expected an instance of a "
+                "MLX-based Keras `Layer` or `Model`. "
+                f"Received instead an object of type '{type(resource)}'. "
+                f"Object received: {resource}"
+            )
+
+        if isinstance(resource, layers.Layer):
+            # Variables in the lists below are actually part of the trackables
+            # that get saved, because the lists are created in __init__.
+            trainable_variables = resource.trainable_variables
+            non_trainable_variables = resource.non_trainable_variables
+
+            self._tf_trackable.trainable_variables += tree.map_structure(
+                self._convert_to_tf_variable, trainable_variables
+            )
+            self._tf_trackable.non_trainable_variables += tree.map_structure(
+                self._convert_to_tf_variable, non_trainable_variables
+            )
+            self._tf_trackable.variables = (
+                self._tf_trackable.trainable_variables
+                + self._tf_trackable.non_trainable_variables
+            )
+
+            self._backend_trainable_variables += trainable_variables
+            self._backend_non_trainable_variables += non_trainable_variables
+            self._backend_variables = (
+                self._backend_trainable_variables
+                + self._backend_non_trainable_variables
+            )
 
     def add_endpoint(self, name, fn, input_signature=None, **kwargs):
         raise NotImplementedError(
diff --git a/keras/src/backend/mlx/math.py b/keras/src/backend/mlx/math.py
@@ -1,10 +1,9 @@
 import math
-import operator
 
 import mlx.core as mx
-import numpy as np
 
 from keras.src.backend import standardize_dtype
+from keras.src.backend.common.backend_utils import canonicalize_axis
 from keras.src.backend.mlx.core import convert_to_tensor
 from keras.src.backend.mlx.linalg import det
 from keras.src.utils.module_utils import scipy
@@ -23,26 +22,31 @@ def _segment_reduction_fn(
     if num_segments is None:
         num_segments = mx.max(segment_ids) + 1
 
-    valid_indices = segment_ids >= 0
-    valid_data = mx.array(
-        np.array(data)[valid_indices]  # MLX does not support boolean indices
-    )
-    valid_segment_ids = mx.array(np.array(segment_ids)[valid_indices])
-
-    data_shape = list(valid_data.shape)
-    data_shape[0] = num_segments
+    mask = segment_ids >= 0
+    # map negative segment_ids to index 0; their data is masked out below
+    safe_segment_ids = mx.where(mask, segment_ids, 0)
 
     if not sorted:
-        sort_indices = mx.argsort(valid_segment_ids)
-        valid_segment_ids = valid_segment_ids[sort_indices]
-        valid_data = valid_data[sort_indices]
+        sort_indices = mx.argsort(safe_segment_ids)
+        safe_segment_ids = mx.take(safe_segment_ids, sort_indices)
+        data = mx.take(data, sort_indices, axis=0)
+        mask = mx.take(mask, sort_indices)
+
+    # expand mask dimensions to match data dimensions
+    for i in range(1, len(data.shape)):
+        mask = mx.expand_dims(mask, axis=i)
+
+    data_shape = list(data.shape)
+    data_shape[0] = num_segments
 
     if reduction_method == "max":
-        result = mx.ones(data_shape, dtype=valid_data.dtype) * -mx.inf
-        result = result.at[valid_segment_ids].maximum(valid_data)
+        masked_data = mx.where(mask, data, -mx.inf)
+        result = mx.ones(data_shape, dtype=data.dtype) * -mx.inf
+        result = result.at[safe_segment_ids].maximum(masked_data)
     else:  # sum
-        result = mx.zeros(data_shape, dtype=valid_data.dtype)
-        result = result.at[valid_segment_ids].add(valid_data)
+        masked_data = mx.where(mask, data, 0)
+        result = mx.zeros(data_shape, dtype=data.dtype)
+        result = result.at[safe_segment_ids].add(masked_data)
 
     return result
 
@@ -154,19 +158,6 @@ def irfft(x, fft_length=None):
     return real_output
 
 
-def _canonicalize_axis(axis, num_dims):
-    # Ref: jax.scipy.signal.stft
-    """Canonicalize an axis in [-num_dims, num_dims) to [0, num_dims)."""
-    axis = operator.index(axis)
-    if not -num_dims <= axis < num_dims:
-        raise ValueError(
-            f"axis {axis} is out of bounds for array of dimension {num_dims}"
-        )
-    if axis < 0:
-        axis = axis + num_dims
-    return axis
-
-
 def _create_sliding_windows(x, window_size, step):
     batch_size, signal_length, _ = x.shape
     num_windows = (signal_length - window_size) // step + 1
@@ -187,7 +178,7 @@ def _create_sliding_windows(x, window_size, step):
 
 def _stft(x, window, nperseg, noverlap, nfft, axis=-1):
     # Ref: jax.scipy.signal.stft
-    axis = _canonicalize_axis(axis, x.ndim)
+    axis = canonicalize_axis(axis, x.ndim)
     result_dtype = mx.complex64
 
     if x.size == 0:
@@ -364,8 +355,8 @@ def _istft(
     # Ref: jax.scipy.signal.istft
     if Zxx.ndim < 2:
         raise ValueError("Input stft must be at least 2d!")
-    freq_axis = _canonicalize_axis(freq_axis, Zxx.ndim)
-    time_axis = _canonicalize_axis(time_axis, Zxx.ndim)
+    freq_axis = canonicalize_axis(freq_axis, Zxx.ndim)
+    time_axis = canonicalize_axis(time_axis, Zxx.ndim)
 
     if freq_axis == time_axis:
         raise ValueError("Must specify differing time and frequency axes!")
diff --git a/keras/src/backend/mlx/nn.py b/keras/src/backend/mlx/nn.py
@@ -1295,9 +1295,6 @@ def dot_product_attention(
     key = convert_to_tensor(key)
     value = convert_to_tensor(value)
 
-    query = convert_to_tensor(query)
-    key = convert_to_tensor(key)
-    value = convert_to_tensor(value)
     if len(query.shape) != 4:
         raise ValueError(
             "`dot_product_attention` only supports 4D inputs. "
diff --git a/keras/src/layers/core/einsum_dense_test.py b/keras/src/layers/core/einsum_dense_test.py
@@ -382,6 +382,10 @@ def test_lora_rank_argument(self):
 
     # Test quantization-related (int8 and float8) methods
 
+    @pytest.mark.skipif(
+        backend.backend() == "mlx",
+        reason="mlx backend doesn't int8 matmul.",
+    )
     def test_quantize_int8(self):
         layer = layers.EinsumDense(
             equation="ab,bcd->acd",
@@ -470,6 +474,10 @@ def test_quantize_int8(self):
         ("btd,ndh->btnh", "btd,ndh->btnh", (None, 2, 8), (1, 2, 4)),
         ("btd,df->btf", "btd,df->btf", (None, 4), (1, 2, 4)),
     )
+    @pytest.mark.skipif(
+        backend.backend() == "mlx",
+        reason="mlx backend doesn't int8 matmul.",
+    )
     def test_quantize_int8_with_specific_equations(
         self, equation, output_shape, input_shape
     ):
@@ -608,6 +616,11 @@ def test_quantize_invalid_mode(self, mode):
     def test_quantize_dtype_argument(
         self, dtype, num_trainable_weights, num_non_trainable_weights
     ):
+        if backend.backend() == "mlx":
+            if "int8" in dtype:
+                self.skipTest("mlx backend doesn't support int8 matmul")
+            if "float8" in dtype:
+                self.skipTest("mlx backend doesn't support float8")
         self.run_layer_test(
             layers.EinsumDense,
             init_kwargs={
@@ -630,6 +643,10 @@ def test_quantize_dtype_argument(
         ("btd,ndh->btnh", "btd,ndh->btnh", (1, 4, 32), (1, 4, 8, 16)),
     )
     @pytest.mark.requires_trainable_backend
+    @pytest.mark.skipif(
+        backend.backend() == "mlx",
+        reason="mlx backend doesn't int8 matmul.",
+    )
     def test_quantize_int8_when_lora_enabled(
         self, equation, input_shape, output_shape
     ):
diff --git a/keras/src/layers/normalization/spectral_normalization.py b/keras/src/layers/normalization/spectral_normalization.py
@@ -1,3 +1,4 @@
+from keras.src import backend
 from keras.src import initializers
 from keras.src import ops
 from keras.src.api_export import keras_export
@@ -76,17 +77,33 @@ def build(self, input_shape):
 
     def call(self, inputs, training=False):
         if training:
-            new_vector_u, new_kernel = ops.cond(
-                ops.all(ops.equal(self.kernel.value, 0)),
-                lambda: (self.vector_u.value, self.kernel.value),
-                self.normalized_weights,
-            )
+            if backend.backend() == "mlx":
+                # ops.cond is non-compilable with mlx backend
+                new_vector_u, new_kernel = self._mlx_get_kernel_update()
+            else:
+                new_vector_u, new_kernel = ops.cond(
+                    ops.all(ops.equal(self.kernel.value, 0)),
+                    lambda: (self.vector_u.value, self.kernel.value),
+                    self.normalized_weights,
+                )
             self.vector_u.assign(new_vector_u)
             self.kernel.assign(new_kernel)
 
         output = self.layer(inputs)
         return ops.cast(output, inputs.dtype)
 
+    def _mlx_get_kernel_update(self):
+        kernel_all_zero = ops.all(ops.equal(self.kernel.value, 0))
+        kernel_all_zero = ops.stop_gradient(kernel_all_zero)
+        normalized_vector_u, normalized_kernel = self.normalized_weights()
+        new_vector_u = ops.where(
+            kernel_all_zero, self.vector_u.value, normalized_vector_u
+        )
+        new_kernel = ops.where(
+            kernel_all_zero, self.kernel.value, normalized_kernel
+        )
+        return new_vector_u, new_kernel
+
     def compute_output_shape(self, input_shape):
         return self.layer.compute_output_shape(input_shape)
 
diff --git a/keras/src/layers/preprocessing/normalization.py b/keras/src/layers/preprocessing/normalization.py
@@ -122,6 +122,8 @@ def __init__(
                 f"must be set. Received: mean={mean} and variance={variance}"
             )
 
+        self._mlx_inputs_captured = False
+
     def build(self, input_shape):
         if input_shape is None:
             return
@@ -297,6 +299,19 @@ def finalize_state(self):
         self.variance = ops.reshape(self.adapt_variance, self._broadcast_shape)
         self.variance = ops.cast(self.variance, self.compute_dtype)
 
+    def _mlx_capture_inputs(self):
+        # mlx evaluates lazily, so when the call is compiled the mean and
+        # variance must already be evaluated; otherwise their values are
+        # not captured and an error is thrown.
+        if self._mlx_inputs_captured:
+            return
+
+        from keras.src.utils.module_utils import mlx
+
+        mlx.core.eval(self.mean)
+        mlx.core.eval(self.variance)
+        self._mlx_inputs_captured = True
+
     def call(self, inputs):
         # This layer can be called in tf.data
         # even with another backend after it has been adapted.
@@ -314,6 +329,8 @@ def call(self, inputs):
         # possible to cause breakage when using this layer in tf.data.
         mean = self.convert_weight(self.mean)
         variance = self.convert_weight(self.variance)
+        if self.backend.name == "mlx":
+            self._mlx_capture_inputs()
         if self.invert:
             return self.backend.numpy.add(
                 mean,
diff --git a/keras/src/layers/reshaping/up_sampling2d_test.py b/keras/src/layers/reshaping/up_sampling2d_test.py
@@ -128,6 +128,9 @@ def test_upsampling_2d_correctness(self):
     def test_upsampling_2d_various_interpolation_methods(self):
         input_shape = (2, 2, 1, 3)
         x = np.arange(np.prod(input_shape)).reshape(input_shape)
+        if backend.backend() == "mlx":
+            # mlx does not support integer matmul
+            x = x.astype("float32")
         for interpolation in ["nearest", "bilinear", "bicubic"]:
             layers.UpSampling2D(size=(1, 2), interpolation=interpolation)(x)
 
diff --git a/keras/src/losses/losses.py b/keras/src/losses/losses.py
@@ -1518,6 +1518,12 @@ def _return_labels_unconverted():
         # Returns the labels unchanged if they are non-binary
         return y_true
 
+    if backend.backend() == "mlx":
+        # ops.cond is non-compilable with mlx backend
+        return ops.where(
+            is_binary, _convert_binary_labels(), _return_labels_unconverted()
+        )
+
     updated_y_true = ops.cond(
         is_binary, _convert_binary_labels, _return_labels_unconverted
     )
diff --git a/keras/src/models/model_test.py b/keras/src/models/model_test.py
@@ -779,6 +779,12 @@ def test_functional_list_outputs_invalid_nested_list_losses(self):
         ("float8", "float8"),
     )
     def test_quantize(self, mode):
+        if backend.backend() == "mlx":
+            self.skipTest(
+                "mlx backend does not support float8."
+                if mode == "float8"
+                else "mlx backend does not support integer matmul"
+            )
         model = _get_model()
         x1 = np.random.rand(2, 3)
         x2 = np.random.rand(2, 3)
@@ -1232,8 +1238,8 @@ def test_export_error(self):
             with self.assertRaisesRegex(
                 NotImplementedError,
                 (
-                    r"`export_saved_model` only currently supports the "
-                    r"tensorflow, jax and torch backends."
+                    r"`ExportArchive` is only compatible with "
+                    r"TensorFlow, JAX and Torch backends."
                 ),
             ):
                 model.export(temp_filepath, format="tf_saved_model")
diff --git a/keras/src/optimizers/loss_scale_optimizer.py b/keras/src/optimizers/loss_scale_optimizer.py