Skip to content

Commit 47c032d

Browse files
authored
implement batch_normalization (#19543)
1 parent fb6244b commit 47c032d

File tree

5 files changed

+29
-7
lines changed

5 files changed

+29
-7
lines changed

keras/backend/exports.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
elif backend.backend() == "jax":
88
BackendVariable = backend.jax.core.Variable
99
backend_name_scope = backend.common.name_scope.name_scope
10+
elif backend.backend() == "mlx":
11+
BackendVariable = backend.mlx.core.Variable
12+
backend_name_scope = backend.common.name_scope.name_scope
1013
elif backend.backend() == "torch":
1114
BackendVariable = backend.torch.core.Variable
1215
backend_name_scope = backend.common.name_scope.name_scope

keras/backend/mlx/core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import mlx.core as mx
22
import numpy as np
3-
import tree
3+
from keras.utils import tree
44

55
from keras.backend.common import KerasVariable
66
from keras.backend.common import standardize_dtype
77
from keras.backend.common.keras_tensor import KerasTensor
88
from keras.backend.common.stateless_scope import StatelessScope
9-
from keras.utils.nest import pack_sequence_as
9+
from keras.utils.tree import pack_sequence_as
1010

1111
SUPPORTS_SPARSE_TENSORS = False
1212

keras/backend/mlx/nn.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -304,9 +304,22 @@ def moments(x, axes, keepdims=False, synchronized=False):
304304
def batch_normalization(
    x, mean, variance, axis, offset=None, scale=None, epsilon=1e-3
):
    """Normalize `x` with the given moments along `axis`.

    Computes `scale * (x - mean) / sqrt(variance + epsilon) + offset`,
    folded into a single multiply-add: `x * inv_std + shift`, where
    `inv_std` absorbs `scale` and `shift` absorbs `-mean * inv_std`
    and `offset`.

    Args:
        x: Input tensor.
        mean: 1-D mean tensor whose length matches `x.shape[axis]`.
        variance: 1-D variance tensor, same length as `mean`.
        axis: Axis of `x` that `mean`/`variance` correspond to.
        offset: Optional 1-D shift (beta); `None` means no shift.
        scale: Optional 1-D scale (gamma); `None` means no scaling.
        epsilon: Small constant added to `variance` for stability.

    Returns:
        The normalized tensor, same shape as `x`.
    """
    # Broadcast shape: 1 on every axis except the normalization axis.
    broadcast_shape = [1] * len(x.shape)
    broadcast_shape[axis] = mean.shape[0]

    mean = mx.reshape(mean, broadcast_shape)
    variance = mx.reshape(variance, broadcast_shape)

    # 1 / sqrt(var + eps), with gamma folded in when present.
    inv_std = mx.rsqrt(variance + epsilon)
    if scale is not None:
        inv_std = inv_std * mx.reshape(scale, broadcast_shape)

    # Constant additive term: -mean * inv_std (+ beta when present).
    shift = -mean * inv_std
    if offset is not None:
        shift = shift + mx.reshape(offset, broadcast_shape)

    return mx.add(x * inv_std, shift)
310323

311324

312325
def ctc_loss(

keras/backend/mlx/numpy.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -900,6 +900,12 @@ def divide(x1, x2):
900900
return mx.divide(x1, x2)
901901

902902

903+
def divide_no_nan(x1, x2):
    """Element-wise `x1 / x2`, returning 0 wherever `x2 == 0`.

    Args:
        x1: Numerator (tensor or convertible value).
        x2: Denominator (tensor or convertible value).

    Returns:
        The quotient, with positions where the denominator is zero
        replaced by 0 instead of inf/nan.
    """
    x1 = convert_to_tensor(x1)
    x2 = convert_to_tensor(x2)
    quotient = mx.divide(x1, x2)
    # Mask out zero-denominator positions rather than propagating
    # the inf/nan produced by the raw division.
    return mx.where(x2 == 0, 0, quotient)
907+
908+
903909
def true_divide(x1, x2):
    """Alias for `divide` (NumPy-compatible true division)."""
    return divide(x1, x2)
905911

keras/backend/mlx/trainer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import mlx.core as mx
22
import numpy as np
3-
import tree
3+
from keras.utils import tree
44

55
from keras import backend
66
from keras import callbacks as callbacks_module
@@ -141,7 +141,7 @@ def compute_loss_and_updates(
141141
# Note that this is needed for the regularization loss, which need
142142
# the latest value of train/non-trainable variables.
143143
loss = self.compute_loss(
144-
x, y, y_pred, sample_weight, allow_empty=True
144+
x, y, y_pred, sample_weight
145145
)
146146
if losses:
147147
loss += ops.sum(losses)

0 commit comments

Comments (0)