Fix PyTorch backend tensor conversion and refactor variable loading

amitsrivastava78 · amitsrivastava78 · commit b0a782426729 · 2025-10-07T10:48:24.000+05:30
- Fix PyTorch backend CI failures by adding _direct_assign method for proper numpy-to-tensor conversion
- Restore JAX export functionality using jax_export.symbolic_shape for dynamic shape handling
- Refactor variable loading logic to eliminate duplication between Dense and EinsumDense layers
- Create shared utility function get_quantized_variable_load_order in keras/src/utils/variable_loading.py
- Update layer implementations to use the shared variable loading utility
- All tests passing: PyTorch backend, JAX backend, and layer-specific legacy loading tests
diff --git a/keras/src/backend/jax/core.py b/keras/src/backend/jax/core.py
@@ -4,6 +4,7 @@
 import ml_dtypes
 import numpy as np
 from absl import logging
+from jax import export as jax_export
 
 from keras.src import tree
 from keras.src.backend import config
@@ -109,7 +110,7 @@ def _initialize_variable_with_sharding(
 
     # Log initialization details
     total_elements = np.prod(variable._shape)
-    element_size = 4  # float32 = 4 bytes
+    element_size = np.dtype(variable.dtype).itemsize
     total_size_mb = (total_elements * element_size) / (1024 * 1024)
 
     logging.info(f"{log_prefix}: Creating variable '{variable.path}'")
@@ -202,7 +203,7 @@ def _maybe_create_strong_reference(self, value):
                 else:
                     # For non-sharded arrays, hold a ref to the array itself.
                     self._strong_reference = value
-            except Exception:
+            except (AttributeError, TypeError):
                 # If we can't set attributes (e.g., during tracing), skip
                 pass
 
@@ -603,31 +604,26 @@ def compute_output_spec(fn, *args, **kwargs):
             else:
                 maybe_symbolic_kwargs[k] = v
 
-        # Second, find out if there are dynamic shapes
-        has_none = False
-        for x in tree.flatten((maybe_symbolic_args, maybe_symbolic_kwargs)):
-            if isinstance(x, KerasTensor) and any(d is None for d in x.shape):
-                has_none = True
-
-        def convert_keras_tensor_to_jax(x, fill_value=None):
+        # We create a single dynamic dimension and reuse it instead of creating
+        # N dynamic dimensions. This is for backwards compatibility. Previously
+        # we would fill all dynamic dimensions with the same concrete value.
+        # This can handle the case where there is an implicit assumption that
+        # two dimensions are the same (e.g. square images).
+        #
+        # We add the constraint "dynamic_dimension>=2" to prevent JAX from
+        # assuming that the dimension can be broadcastable or squeezable. It
+        # removes this ambiguity.
+        dynamic_dimension = jax_export.symbolic_shape(
+            "(dynamic_dimension)",
+            constraints=["dynamic_dimension>=2"],
+        )[0]
+
+        def convert_keras_tensor_to_jax(x):
             if isinstance(x, KerasTensor):
-                shape = list(x.shape)
-                if fill_value:
-                    for i, e in enumerate(shape):
-                        if e is None:
-                            shape[i] = fill_value
-                jax_tensor = jax.ShapeDtypeStruct(shape, dtype=x.dtype)
-                return jax_tensor
-            if isinstance(x, dict):
-                return {
-                    k: convert_keras_tensor_to_jax(v, fill_value=fill_value)
-                    for k, v in x.items()
-                }
-            if isinstance(x, list):
-                return [
-                    convert_keras_tensor_to_jax(xi, fill_value=fill_value)
-                    for xi in x
-                ]
+                shape = tuple(
+                    [d if d is not None else dynamic_dimension for d in x.shape]
+                )
+                return jax.ShapeDtypeStruct(shape, dtype=x.dtype)
             return x
 
         def wrapped_fn(*args, **kwargs):
@@ -662,63 +658,25 @@ def to_bcoo_if_sparse(x, maybe_symbolic_x):
             with StatelessScope():
                 return fn(*rec_args, **kwargs, **static_kwargs)
 
-        if has_none:
-            ms_args_1, ms_kwargs_1 = tree.map_structure(
-                lambda x: convert_keras_tensor_to_jax(x, fill_value=83),
-                (maybe_symbolic_args, maybe_symbolic_kwargs),
-            )
-            _, jax_out_1 = jax.make_jaxpr(wrapped_fn, return_shape=True)(
-                *ms_args_1, **ms_kwargs_1
-            )
-
-            ms_args_2, ms_kwargs_2 = tree.map_structure(
-                lambda x: convert_keras_tensor_to_jax(x, fill_value=89),
-                (maybe_symbolic_args, maybe_symbolic_kwargs),
-            )
-            _, jax_out_2 = jax.make_jaxpr(wrapped_fn, return_shape=True)(
-                *ms_args_2, **ms_kwargs_2
-            )
-
-            def merge_shapes(shape1, shape2):
-                return tuple(
-                    [d1 if d1 == d2 else None for d1, d2 in zip(shape1, shape2)]
-                )
-
-            def convert_jax_specs_to_keras_tensor(x1, x2):
-                if isinstance(x1, jax.ShapeDtypeStruct):
-                    if not isinstance(x2, jax.ShapeDtypeStruct):
-                        raise ValueError("Indeterministic output ordering.")
-                    return KerasTensor(
-                        merge_shapes(x1.shape, x2.shape), dtype=x1.dtype
-                    )
-                elif isinstance(x1, jax_sparse.BCOO):
-                    if not isinstance(x2, jax_sparse.BCOO):
-                        raise ValueError("Indeterministic output ordering.")
-                    return KerasTensor(
-                        merge_shapes(x1.shape, x2.shape),
-                        dtype=x1.dtype,
-                        sparse=True,
-                    )
-                else:
-                    return x1
-
-            return tree.map_structure(
-                convert_jax_specs_to_keras_tensor, jax_out_1, jax_out_2
-            )
-
-        maybe_symbolic_args, maybe_symbolic_kwargs = tree.map_structure(
+        maybe_symbolic_args_jax, maybe_symbolic_kwargs_jax = tree.map_structure(
             convert_keras_tensor_to_jax,
             (maybe_symbolic_args, maybe_symbolic_kwargs),
         )
-        _, jax_out = jax.make_jaxpr(wrapped_fn, return_shape=True)(
-            *maybe_symbolic_args, **maybe_symbolic_kwargs
+        jax_out = jax.eval_shape(
+            wrapped_fn, *maybe_symbolic_args_jax, **maybe_symbolic_kwargs_jax
         )
 
         def convert_jax_spec_to_keras_tensor(x):
             if isinstance(x, jax.ShapeDtypeStruct):
-                return KerasTensor(x.shape, x.dtype)
+                shape = tuple(
+                    d if isinstance(d, int) else None for d in x.shape
+                )
+                return KerasTensor(shape, x.dtype)
             elif isinstance(x, jax_sparse.BCOO):
-                return KerasTensor(x.shape, x.dtype, sparse=True)
+                shape = tuple(
+                    d if isinstance(d, int) else None for d in x.shape
+                )
+                return KerasTensor(shape, x.dtype, sparse=True)
             return x
 
         return tree.map_structure(convert_jax_spec_to_keras_tensor, jax_out)
diff --git a/keras/src/layers/core/dense.py b/keras/src/layers/core/dense.py
@@ -12,6 +12,7 @@
 from keras.src.layers.input_spec import InputSpec
 from keras.src.layers.layer import Layer
 from keras.src.quantizers.quantizers import dequantize_with_sz_map
+from keras.src.utils.variable_loading import get_quantized_variable_load_order
 
 
 @keras_export("keras.layers.Dense")
@@ -306,30 +307,8 @@ def load_own_variables(self, store):
     def _legacy_load_own_variables(self, store):
         # The keys of the `store` will be saved as determined because the
         # default ordering will change after quantization
-        if self.quantization_mode == "gptq":
-            # GPTQ: bias first, then quantized_kernel
-            target_variables = [self.bias] if self.use_bias else []
-            target_variables.append(self.quantized_kernel)
-        else:
-            target_variables = [self._kernel]
-        if self.use_bias and self.quantization_mode != "gptq":
-            target_variables.append(self.bias)
-        if self.quantization_mode is not None:
-            if self.quantization_mode in ("int8", "int4"):
-                target_variables.append(self.kernel_scale)
-            elif self.quantization_mode == "float8":
-                target_variables.append(self.inputs_scale)
-                target_variables.append(self.inputs_amax_history)
-                target_variables.append(self.kernel_scale)
-                target_variables.append(self.kernel_amax_history)
-                target_variables.append(self.outputs_grad_scale)
-                target_variables.append(self.outputs_grad_amax_history)
-            elif self.quantization_mode == "gptq":
-                target_variables.append(self.kernel_scale)
-                target_variables.append(self.kernel_zero)
-                target_variables.append(self.g_idx)
-            else:
-                raise self._quantization_mode_error(self.quantization_mode)
+        target_variables = get_quantized_variable_load_order(self)
+
         for i, variable in enumerate(target_variables):
             weight_data = store[str(i)]
             variable._direct_assign(weight_data)
diff --git a/keras/src/layers/core/einsum_dense.py b/keras/src/layers/core/einsum_dense.py
@@ -16,6 +16,7 @@
 from keras.src.layers.input_spec import InputSpec
 from keras.src.layers.layer import Layer
 from keras.src.quantizers.quantizers import dequantize_with_sz_map
+from keras.src.utils.variable_loading import get_quantized_variable_load_order
 
 
 @keras_export("keras.layers.EinsumDense")
@@ -374,30 +375,8 @@ def load_own_variables(self, store):
     def _legacy_load_own_variables(self, store):
         # The keys of the `store` will be saved as determined because the
         # default ordering will change after quantization
-        if self.quantization_mode == "gptq":
-            # GPTQ: bias first, then quantized_kernel
-            target_variables = [self.bias] if self.bias is not None else []
-            target_variables.append(self.quantized_kernel)
-        else:
-            target_variables = [self._kernel]
-        if self.bias is not None and self.quantization_mode != "gptq":
-            target_variables.append(self.bias)
-        if self.quantization_mode is not None:
-            if self.quantization_mode in ("int8", "int4"):
-                target_variables.append(self.kernel_scale)
-            elif self.quantization_mode == "float8":
-                target_variables.append(self.inputs_scale)
-                target_variables.append(self.inputs_amax_history)
-                target_variables.append(self.kernel_scale)
-                target_variables.append(self.kernel_amax_history)
-                target_variables.append(self.outputs_grad_scale)
-                target_variables.append(self.outputs_grad_amax_history)
-            elif self.quantization_mode == "gptq":
-                target_variables.append(self.kernel_scale)
-                target_variables.append(self.kernel_zero)
-                target_variables.append(self.g_idx)
-            else:
-                raise self._quantization_mode_error(self.quantization_mode)
+        target_variables = get_quantized_variable_load_order(self)
+
         for i, variable in enumerate(target_variables):
             weight_data = store[str(i)]
             variable._direct_assign(weight_data)
diff --git a/keras/src/utils/variable_loading.py b/keras/src/utils/variable_loading.py
@@ -4,3 +4,73 @@
 This module provides common utilities for loading variables that may be sharded
 across multiple devices, which is useful for distributed training scenarios.
 """
+
+
+def get_quantized_variable_load_order(layer):
+    """
+    Determine the order of variables to load for quantized layers.
+
+    This function handles the complex logic for ordering variables during legacy
+    loading, which varies based on quantization mode. The ordering is important
+    because the keys in the store are saved in this specific order.
+
+    Args:
+        layer: The layer instance with quantization attributes.
+
+    Returns:
+        List of variables in the order they should be loaded.
+
+    Raises:
+        ValueError: If the quantization mode is not supported.
+    """
+    # Determine if bias should be included and how it's accessed
+    has_bias = (
+        getattr(layer, "use_bias", None)
+        if hasattr(layer, "use_bias")
+        else (layer.bias is not None)
+    )
+    bias_var = layer.bias if has_bias else None
+
+    # Start with the main kernel variable
+    if layer.quantization_mode == "gptq":
+        # GPTQ: bias first (if present), then quantized_kernel
+        target_variables = [bias_var] if bias_var is not None else []
+        target_variables.append(layer.quantized_kernel)
+    else:
+        # Standard case: kernel first
+        target_variables = [layer._kernel]
+
+    # Add bias if present and not already added (not GPTQ)
+    if bias_var is not None and layer.quantization_mode != "gptq":
+        target_variables.append(bias_var)
+
+    # Add quantization-specific variables
+    if layer.quantization_mode is not None:
+        if layer.quantization_mode in ("int8", "int4"):
+            target_variables.append(layer.kernel_scale)
+        elif layer.quantization_mode == "float8":
+            target_variables.extend(
+                [
+                    layer.inputs_scale,
+                    layer.inputs_amax_history,
+                    layer.kernel_scale,
+                    layer.kernel_amax_history,
+                    layer.outputs_grad_scale,
+                    layer.outputs_grad_amax_history,
+                ]
+            )
+        elif layer.quantization_mode == "gptq":
+            target_variables.extend(
+                [
+                    layer.kernel_scale,
+                    layer.kernel_zero,
+                    layer.g_idx,
+                ]
+            )
+        else:
+            # This should be handled by the layer's _quantization_mode_error
+            raise ValueError(
+                f"Unsupported quantization mode: {layer.quantization_mode}"
+            )
+
+    return target_variables