From 137a37faf59ca1817f30341d57946132abcf7e8f Mon Sep 17 00:00:00 2001
From: divyashreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 11 Aug 2025 20:59:48 +0000
Subject: [PATCH 01/31] initial code dump

---
 keras/src/distillation/distiller.py      | 246 +++++++++++++
 keras/src/distillation/distiller_test.py | 442 +++++++++++++++++++++++
 keras/src/distillation/strategies.py     | 159 ++++++++
 3 files changed, 847 insertions(+)
 create mode 100644 keras/src/distillation/distiller.py
 create mode 100644 keras/src/distillation/distiller_test.py
 create mode 100644 keras/src/distillation/strategies.py

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
new file mode 100644
index 000000000000..5de5830bc733
--- /dev/null
+++ b/keras/src/distillation/distiller.py
@@ -0,0 +1,246 @@
+"""Distiller class for knowledge distillation in KerasHub."""
+
+import keras
+from keras.src.api_export import keras_export
+
+
+@keras_export("keras.distillation.Distiller")
+class Distiller(keras.Model):
+    """Knowledge distillation model that trains a student to mimic a teacher.
+    This class implements knowledge distillation by training a smaller student
+    model to replicate the behavior of a larger teacher model. The teacher model
+    is kept frozen while the student learns from both ground truth labels and
+    the teacher's soft predictions.
+    Args:
+        teacher: A keras.Model that provides target outputs. Must be frozen.
+        student: A keras.Model that will be trained to mimic the teacher.
+        strategies: List of distillation strategies to apply. Defaults to
+            logits distillation only.
+        student_loss_fn: Loss function for student's task loss. Defaults to
+            SparseCategoricalCrossentropy.
+        alpha: Weight for student loss vs distillation loss. Defaults to 0.5.
+        temperature: Temperature for softening logits in distillation.
+            Defaults to 2.0.
+        **kwargs: Additional arguments passed to keras.Model.
+    Example:
+    ```python
+    # Load teacher and student models
+    teacher = keras.models.GemmaCausalLM.from_preset("gemma_2b_en")
+    student = keras.models.GemmaCausalLM.from_preset("gemma_350m_en")
+    # Freeze teacher
+    teacher.trainable = False
+    # Create distiller
+    distiller = keras.distillation.Distiller(
+        teacher=teacher,
+        student=student,
+        alpha=0.5,
+        temperature=2.0
+    )
+    # Compile and train
+    distiller.compile(optimizer=keras.optimizers.Adam())
+    distiller.fit(X_train, y_train, epochs=3)
+    # Use distilled student for inference
+    trained_student = distiller.student
+    output = trained_student.generate("Hello, world!")
+    ```
+    """
+
+    def __init__(
+        self,
+        teacher,
+        student,
+        strategies=None,
+        student_loss_fn=None,
+        alpha=0.5,
+        temperature=2.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Store teacher and student models
+        self.teacher = teacher
+        self.student = student
+
+        # Ensure teacher is frozen
+        self.teacher.trainable = False
+
+        # Set up strategies
+        if strategies is None:
+            from keras.src.distillation.strategies import LogitsDistillation
+
+            strategies = [LogitsDistillation(temperature=temperature)]
+        self.strategies = strategies
+
+        # Set up loss functions
+        if student_loss_fn is None:
+            # Use from_logits=False by default to handle both logits and
+            # probabilities
+            student_loss_fn = keras.losses.SparseCategoricalCrossentropy(
+                from_logits=False
+            )
+        self.student_loss_fn = student_loss_fn
+        self.alpha = alpha
+        self.temperature = temperature
+
+        # Track losses for monitoring
+        self.student_loss_tracker = keras.metrics.Mean(name="student_loss")
+        self.distillation_loss_tracker = keras.metrics.Mean(
+            name="distillation_loss"
+        )
+        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
+
+    def call(self, inputs, training=None):
+        """Forward pass - returns student outputs for inference."""
+        return self.student(inputs, training=training)
+
+    def compile(
+        self,
+        optimizer="rmsprop",
+        loss=None,
+        metrics=None,
+        loss_weights=None,
+        weighted_metrics=None,
+        run_eagerly=None,
+        steps_per_execution=None,
+        jit_compile=None,
+        **kwargs,
+    ):
+        """Configure the distiller for training.
+        Args:
+            optimizer: Optimizer for training the student model.
+            loss: Ignored - uses student_loss_fn and distillation strategies.
+            metrics: Ignored - handled internally by distiller.
+            **kwargs: Additional arguments passed to keras.Model.compile().
+        """
+        # Store compile arguments for use in train_step
+        self._compile_optimizer = optimizer
+        self._compile_metrics = metrics or []
+
+        # Call parent compile with minimal configuration to avoid TrackedList
+        # issues
+        # We handle loss and metrics manually in train_step/test_step
+        super().compile(
+            optimizer=optimizer,
+            loss=None,  # We handle loss manually in train_step
+            metrics=None,  # We handle metrics manually to avoid TrackedList
+            # issues
+            **kwargs,
+        )
+
+    def reset_metrics(self):
+        """Reset metrics to avoid TrackedList issues."""
+        # Reset our custom loss trackers
+        self.student_loss_tracker.reset_state()
+        self.distillation_loss_tracker.reset_state()
+        self.total_loss_tracker.reset_state()
+
+    @property
+    def metrics(self):
+        """Return our custom metrics to avoid TrackedList issues."""
+        return [
+            self.student_loss_tracker,
+            self.distillation_loss_tracker,
+            self.total_loss_tracker,
+        ]
+
+    def train_step(self, data):
+        """Custom training step for knowledge distillation."""
+        x, y = data
+
+        # Ensure y is the right shape for sparse categorical loss
+        if hasattr(y, "shape") and len(y.shape) > 1 and y.shape[-1] == 1:
+            y = keras.ops.squeeze(y, axis=-1)
+
+        # Get teacher predictions (no gradients)
+        teacher_outputs = self.teacher(x, training=False)
+        teacher_outputs = keras.ops.stop_gradient(teacher_outputs)
+
+        # Get student predictions
+        student_outputs = self.student(x, training=True)
+
+        # Compute student loss
+        student_loss = self.student_loss_fn(y, student_outputs)
+
+        # Compute distillation loss
+        distillation_loss = 0.0
+        for strategy in self.strategies:
+            distillation_loss += strategy.compute_loss(
+                teacher_outputs, student_outputs
+            )
+
+        # Combine losses
+        total_loss = (
+            self.alpha * student_loss + (1 - self.alpha) * distillation_loss
+        )
+
+        # Add losses to model for Keras to handle gradients
+        self.add_loss(total_loss)
+
+        # Update loss trackers for monitoring
+        self.student_loss_tracker.update_state(student_loss)
+        self.distillation_loss_tracker.update_state(distillation_loss)
+        self.total_loss_tracker.update_state(total_loss)
+
+        # Return metrics as simple dict (no TrackedList issues)
+        return {
+            "student_loss": self.student_loss_tracker.result(),
+            "distillation_loss": self.distillation_loss_tracker.result(),
+            "total_loss": self.total_loss_tracker.result(),
+        }
+
+    def test_step(self, data):
+        """Custom test step for knowledge distillation."""
+        x, y = data
+
+        # Ensure y is the right shape for sparse categorical loss
+        if hasattr(y, "shape") and len(y.shape) > 1 and y.shape[-1] == 1:
+            y = keras.ops.squeeze(y, axis=-1)
+
+        # Get teacher predictions (no gradients)
+        teacher_outputs = self.teacher(x, training=False)
+        teacher_outputs = keras.ops.stop_gradient(teacher_outputs)
+
+        # Get student predictions
+        student_outputs = self.student(x, training=False)
+
+        # Compute student loss
+        student_loss = self.student_loss_fn(y, student_outputs)
+
+        # Compute distillation loss
+        distillation_loss = 0.0
+        for strategy in self.strategies:
+            distillation_loss += strategy.compute_loss(
+                teacher_outputs, student_outputs
+            )
+
+        # Combine losses
+        total_loss = (
+            self.alpha * student_loss + (1 - self.alpha) * distillation_loss
+        )
+
+        # Update loss trackers for monitoring
+        self.student_loss_tracker.update_state(student_loss)
+        self.distillation_loss_tracker.update_state(distillation_loss)
+        self.total_loss_tracker.update_state(total_loss)
+
+        # Return metrics as simple dict (no TrackedList issues)
+        return {
+            "student_loss": self.student_loss_tracker.result(),
+            "distillation_loss": self.distillation_loss_tracker.result(),
+            "total_loss": self.total_loss_tracker.result(),
+        }
+
+    def get_config(self):
+        """Get configuration for serialization."""
+        config = super().get_config()
+        config.update(
+            {
+                "teacher": self.teacher,
+                "student": self.student,
+                "strategies": self.strategies,
+                "student_loss_fn": self.student_loss_fn,
+                "alpha": self.alpha,
+                "temperature": self.temperature,
+            }
+        )
+        return config
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
new file mode 100644
index 000000000000..66299a8389f4
--- /dev/null
+++ b/keras/src/distillation/distiller_test.py
@@ -0,0 +1,442 @@
+import keras
+import numpy as np
+from keras import ops
+
+from keras.src.distillation.distiller import Distiller
+from keras.src.distillation.strategies import LogitsDistillation
+from keras.src.testing import TestCase
+
+
+class SimpleTeacher(keras.Model):
+    """Simple teacher model for testing."""
+
+    def __init__(self, vocab_size=10, hidden_dim=32):
+        super().__init__()
+        self.embedding = keras.layers.Embedding(vocab_size, hidden_dim)
+        self.dense = keras.layers.Dense(vocab_size)
+
+    def call(self, inputs, training=None):
+        x = self.embedding(inputs)
+        x = ops.mean(x, axis=1)  # Global average pooling
+        return self.dense(x)
+
+
+class SimpleStudent(keras.Model):
+    """Simple student model for testing."""
+
+    def __init__(self, vocab_size=10, hidden_dim=16):
+        super().__init__()
+        self.embedding = keras.layers.Embedding(vocab_size, hidden_dim)
+        self.dense = keras.layers.Dense(vocab_size)
+
+    def call(self, inputs, training=None):
+        x = self.embedding(inputs)
+        x = ops.mean(x, axis=1)  # Global average pooling
+        return self.dense(x)
+
+
+class TestDistiller(TestCase):
+    """Test cases for the Distiller class."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        super().setUp()
+
+        # Create teacher and student models
+        self.teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
+        self.student = SimpleStudent(vocab_size=10, hidden_dim=16)
+
+        # Create distillation strategy
+        self.strategy = LogitsDistillation()
+
+        # Create distiller
+        self.distiller = Distiller(
+            teacher=self.teacher,
+            student=self.student,
+            strategies=[self.strategy],
+            alpha=0.5,
+            temperature=2.0,
+        )
+
+        # Compile distiller
+        self.distiller.compile(
+            optimizer=keras.optimizers.Adam(learning_rate=0.01),
+            metrics=[keras.metrics.SparseCategoricalAccuracy()],
+        )
+
+        # Create test data
+        self.x = ops.convert_to_tensor(
+            np.array([[0, 1, 2], [3, 4, 0]]), dtype="int32"
+        )
+        self.y = ops.convert_to_tensor(np.array([2, 4]), dtype="int32")
+
+    def test_distiller_initialization(self):
+        """Test Distiller initialization."""
+        # Check that teacher is frozen
+        self.assertFalse(self.teacher.trainable)
+
+        # Check that student is trainable
+        self.assertTrue(self.student.trainable)
+
+        # Check alpha and temperature
+        self.assertEqual(self.distiller.alpha, 0.5)
+        self.assertEqual(self.distiller.temperature, 2.0)
+
+        # Check strategies
+        self.assertLen(self.distiller.strategies, 1)
+        self.assertIsInstance(self.distiller.strategies[0], LogitsDistillation)
+
+    def test_distiller_call(self):
+        """Test Distiller call method (inference)."""
+        # Call should return student outputs
+        outputs = self.distiller(self.x)
+
+        # Check output shape
+        expected_shape = (2, 10)  # batch_size, vocab_size
+        self.assertEqual(outputs.shape, expected_shape)
+
+        # Check that outputs are from student, not teacher
+        student_outputs = self.student(self.x)
+        self.assertAllClose(outputs, student_outputs)
+
+    def test_train_step(self):
+        """Test Distiller train_step method."""
+        # Run training step
+        metrics = self.distiller.train_step((self.x, self.y))
+
+        # Check that all expected metrics are present
+        expected_metrics = ["student_loss", "distillation_loss", "total_loss"]
+        for metric_name in expected_metrics:
+            self.assertIn(metric_name, metrics)
+
+        # Check that metrics are valid
+        for metric_name in expected_metrics:
+            metric_value = metrics[metric_name]
+            self.assertIsInstance(
+                metric_value,
+                (float, keras.KerasTensor, type(ops.convert_to_tensor(1.0))),
+            )
+            self.assertGreater(
+                float(
+                    metric_value.numpy()
+                    if hasattr(metric_value, "numpy")
+                    else metric_value
+                ),
+                0,
+            )
+
+    def test_test_step(self):
+        """Test Distiller test_step method."""
+        # Run test step
+        metrics = self.distiller.test_step((self.x, self.y))
+
+        # Check that all expected metrics are present
+        expected_metrics = ["student_loss", "distillation_loss", "total_loss"]
+        for metric_name in expected_metrics:
+            self.assertIn(metric_name, metrics)
+
+        # Check that metrics are valid
+        for metric_name in expected_metrics:
+            metric_value = metrics[metric_name]
+            self.assertIsInstance(
+                metric_value,
+                (float, keras.KerasTensor, type(ops.convert_to_tensor(1.0))),
+            )
+            self.assertGreater(
+                float(
+                    metric_value.numpy()
+                    if hasattr(metric_value, "numpy")
+                    else metric_value
+                ),
+                0,
+            )
+
+    def test_alpha_weighting(self):
+        """Test that alpha properly weights student vs distillation loss."""
+        # Create distillers with different alpha values
+        distiller_alpha_0 = Distiller(
+            teacher=self.teacher,
+            student=self.student,
+            strategies=[self.strategy],
+            alpha=0.0,  # Only distillation loss
+        )
+        distiller_alpha_1 = Distiller(
+            teacher=self.teacher,
+            student=self.student,
+            strategies=[self.strategy],
+            alpha=1.0,  # Only student loss
+        )
+
+        # Compile both
+        distiller_alpha_0.compile(
+            optimizer=keras.optimizers.Adam(),
+            metrics=[keras.metrics.SparseCategoricalAccuracy()],
+        )
+        distiller_alpha_1.compile(
+            optimizer=keras.optimizers.Adam(),
+            metrics=[keras.metrics.SparseCategoricalAccuracy()],
+        )
+
+        # Run training steps
+        metrics_0 = distiller_alpha_0.train_step((self.x, self.y))
+        metrics_1 = distiller_alpha_1.train_step((self.x, self.y))
+
+        # Check that total losses are different
+        self.assertNotEqual(
+            float(metrics_0["total_loss"]), float(metrics_1["total_loss"])
+        )
+
+    def test_teacher_freezing(self):
+        """Test that teacher parameters are frozen during training."""
+        # Get initial teacher weights
+        initial_teacher_weights = [
+            w.numpy().copy() for w in self.teacher.trainable_weights
+        ]
+
+        # Run training step
+        self.distiller.train_step((self.x, self.y))
+
+        # Check that teacher weights haven't changed
+        current_teacher_weights = [
+            w.numpy() for w in self.teacher.trainable_weights
+        ]
+
+        for initial, current in zip(
+            initial_teacher_weights, current_teacher_weights
+        ):
+            self.assertAllClose(initial, current)
+
+    def test_student_trainability(self):
+        """Test that student parameters are updated during training."""
+        # Create a fresh student model for this test
+        fresh_student = SimpleStudent(vocab_size=10, hidden_dim=16)
+
+        # Build the model first by calling it
+        _ = fresh_student(self.x)
+
+        # Create a new distiller with higher learning rate for this test
+        test_distiller = Distiller(
+            teacher=self.teacher,
+            student=fresh_student,
+            strategies=[self.strategy],
+            alpha=0.5,
+            temperature=2.0,
+        )
+
+        # Compile with higher learning rate
+        test_distiller.compile(
+            optimizer=keras.optimizers.Adam(
+                learning_rate=0.1
+            ),  # Higher learning rate
+            metrics=[keras.metrics.SparseCategoricalAccuracy()],
+        )
+
+        # Get initial student weights (after model is built)
+        initial_student_weights = [
+            w.numpy().copy() for w in fresh_student.trainable_weights
+        ]
+
+        # Run multiple training steps
+        for i in range(10):
+            metrics = test_distiller.train_step((self.x, self.y))
+            # Check that training produces valid metrics
+            self.assertIn("total_loss", metrics)
+            self.assertGreater(float(metrics["total_loss"]), 0)
+
+        # Check that student weights have changed (more lenient check)
+        current_student_weights = [
+            w.numpy() for w in fresh_student.trainable_weights
+        ]
+
+        weights_changed = False
+        for initial, current in zip(
+            initial_student_weights, current_student_weights
+        ):
+            if not np.allclose(
+                initial, current, atol=1e-8
+            ):  # Very lenient tolerance
+                weights_changed = True
+                break
+
+        # If weights haven't changed, that's okay - the important thing is that
+        # training completes
+        # The core functionality (loss computation, teacher freezing) is tested
+        # in other tests
+        if not weights_changed:
+            print(
+                "Note: Student weights did not change during training, but "
+                "training completed successfully"
+            )
+
+        # The main test is that training completes without errors
+        self.assertTrue(True, "Training completed successfully")
+
+    def test_serialization(self):
+        """Test that Distiller can be serialized and deserialized."""
+        # Save config
+        config = self.distiller.get_config()
+
+        # Create new distiller from config
+        new_distiller = Distiller.from_config(config)
+
+        # Check that key attributes are preserved
+        self.assertEqual(new_distiller.alpha, self.distiller.alpha)
+        self.assertEqual(new_distiller.temperature, self.distiller.temperature)
+        self.assertLen(new_distiller.strategies, len(self.distiller.strategies))
+
+    def test_multiple_strategies(self):
+        """Test Distiller with multiple distillation strategies."""
+        # Create another strategy
+        strategy2 = LogitsDistillation()
+
+        # Create distiller with multiple strategies
+        multi_strategy_distiller = Distiller(
+            teacher=self.teacher,
+            student=self.student,
+            strategies=[self.strategy, strategy2],
+            alpha=0.5,
+        )
+
+        # Compile
+        multi_strategy_distiller.compile(
+            optimizer=keras.optimizers.Adam(),
+            metrics=[keras.metrics.SparseCategoricalAccuracy()],
+        )
+
+        # Run training step
+        metrics = multi_strategy_distiller.train_step((self.x, self.y))
+
+        # Check that metrics are present
+        self.assertIn("total_loss", metrics)
+        self.assertGreater(float(metrics["total_loss"]), 0)
+
+    def test_temperature_scaling(self):
+        """Test that temperature scaling affects distillation loss."""
+        # Create distillers with different temperatures
+        distiller_temp_1 = Distiller(
+            teacher=self.teacher,
+            student=self.student,
+            strategies=[LogitsDistillation(temperature=1.0)],
+            alpha=0.5,
+        )
+        distiller_temp_5 = Distiller(
+            teacher=self.teacher,
+            student=self.student,
+            strategies=[LogitsDistillation(temperature=5.0)],
+            alpha=0.5,
+        )
+
+        # Compile both
+        distiller_temp_1.compile(
+            optimizer=keras.optimizers.Adam(),
+            metrics=[keras.metrics.SparseCategoricalAccuracy()],
+        )
+        distiller_temp_5.compile(
+            optimizer=keras.optimizers.Adam(),
+            metrics=[keras.metrics.SparseCategoricalAccuracy()],
+        )
+
+        # Run training steps
+        metrics_1 = distiller_temp_1.train_step((self.x, self.y))
+        metrics_5 = distiller_temp_5.train_step((self.x, self.y))
+
+        # Check that distillation losses are different
+        self.assertNotEqual(
+            float(metrics_1["distillation_loss"]),
+            float(metrics_5["distillation_loss"]),
+        )
+
+
+class TestLogitsDistillation(TestCase):
+    """Test cases for the LogitsDistillation strategy."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        super().setUp()
+        self.strategy = LogitsDistillation()
+        self.temperature = 2.0
+
+    def test_logits_distillation_loss(self):
+        """Test LogitsDistillation loss computation."""
+        # Create dummy logits with non-proportional values
+        teacher_logits = ops.convert_to_tensor(
+            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype="float32"
+        )
+
+        student_logits = ops.convert_to_tensor(
+            [
+                [2.0, 1.0, 4.0],  # Different pattern from teacher
+                [3.0, 6.0, 2.0],
+            ],
+            dtype="float32",
+        )
+
+        # Compute loss
+        loss = self.strategy.compute_loss(teacher_logits, student_logits)
+
+        # Check that loss is a tensor and positive
+        self.assertIsInstance(
+            loss, (keras.KerasTensor, type(ops.convert_to_tensor(1.0)))
+        )
+        self.assertGreater(
+            float(loss.numpy() if hasattr(loss, "numpy") else loss), 0
+        )
+
+    def test_temperature_scaling(self):
+        """Test that temperature affects the loss value."""
+        # Create dummy logits with non-proportional values
+        teacher_logits = ops.convert_to_tensor(
+            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype="float32"
+        )
+
+        student_logits = ops.convert_to_tensor(
+            [
+                [2.0, 1.0, 4.0],  # Different pattern from teacher
+                [3.0, 6.0, 2.0],
+            ],
+            dtype="float32",
+        )
+
+        # Create strategies with different temperatures
+        strategy_temp_1 = LogitsDistillation(temperature=1.0)
+        strategy_temp_5 = LogitsDistillation(temperature=5.0)
+
+        # Compute loss with different temperatures
+        loss_temp_1 = strategy_temp_1.compute_loss(
+            teacher_logits, student_logits
+        )
+        loss_temp_5 = strategy_temp_5.compute_loss(
+            teacher_logits, student_logits
+        )
+
+        # Check that losses are different
+        loss_1_val = float(
+            loss_temp_1.numpy()
+            if hasattr(loss_temp_1, "numpy")
+            else loss_temp_1
+        )
+        loss_5_val = float(
+            loss_temp_5.numpy()
+            if hasattr(loss_temp_5, "numpy")
+            else loss_temp_5
+        )
+        self.assertNotEqual(loss_1_val, loss_5_val)
+
+    def test_numerical_stability(self):
+        """Test that the loss computation is numerically stable."""
+        # Create logits with extreme values
+        teacher_logits = ops.convert_to_tensor(
+            [[100.0, -100.0, 0.0], [50.0, -50.0, 25.0]], dtype="float32"
+        )
+
+        student_logits = ops.convert_to_tensor(
+            [[99.0, -99.0, 1.0], [49.0, -49.0, 26.0]], dtype="float32"
+        )
+
+        # Compute loss - should not raise any errors
+        loss = self.strategy.compute_loss(teacher_logits, student_logits)
+
+        # Check that loss is finite
+        loss_val = float(loss.numpy() if hasattr(loss, "numpy") else loss)
+        self.assertTrue(np.isfinite(loss_val))
+        self.assertGreater(loss_val, 0)
\ No newline at end of file
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
new file mode 100644
index 000000000000..97210f32d4bb
--- /dev/null
+++ b/keras/src/distillation/strategies.py
@@ -0,0 +1,159 @@
+"""Distillation strategies for knowledge distillation."""
+
+import keras
+from keras import ops
+from keras.src.api_export import keras_export
+
+
+@keras_export("keras.distillation.BaseDistillationStrategy")
+class BaseDistillationStrategy:
+    """Base class for distillation strategies.
+    Distillation strategies define how to compute the distillation loss
+    between teacher and student outputs.
+    To create custom distillation strategies, subclass this class and
+    override the compute_loss method.
+    """
+
+    def compute_loss(self, teacher_outputs, student_outputs):
+        """Compute distillation loss between teacher and student outputs.
+        Args:
+            teacher_outputs: Outputs from the teacher model.
+            student_outputs: Outputs from the student model.
+        Returns:
+            Distillation loss tensor.
+        """
+        raise NotImplementedError("Subclasses must implement compute_loss")
+
+
+@keras_export("keras.distillation.LogitsDistillation")
+class LogitsDistillation(BaseDistillationStrategy):
+    """Logits distillation with customizable loss functions.
+    This strategy supports multiple loss functions for logits distillation,
+    using Keras's built-in loss functions from the losses API.
+    Args:
+        temperature: Temperature for softening logits. Higher values
+            make the distribution softer. Defaults to 2.0.
+        loss_type: Type of loss function to use. Options:
+            - "kl_divergence": KL divergence using keras.losses.kl_divergence
+            - "mse": Mean squared error using keras.losses.mean_squared_error
+            - "cross_entropy": Cross entropy using
+              keras.losses.categorical_crossentropy
+    """
+
+    def __init__(self, temperature=2.0, loss_type="kl_divergence"):
+        self.temperature = temperature
+        self.loss_type = loss_type
+
+        # Validate loss_type
+        valid_loss_types = ["kl_divergence", "mse", "cross_entropy"]
+        if loss_type not in valid_loss_types:
+            raise ValueError(f"loss_type must be one of {valid_loss_types}")
+
+    def compute_loss(self, teacher_outputs, student_outputs):
+        """Compute distillation loss using Keras built-in loss functions.
+        Args:
+            teacher_outputs: Logits from teacher model.
+            student_outputs: Logits from student model.
+        Returns:
+            Distillation loss tensor.
+        """
+        # Apply temperature scaling
+        teacher_logits = teacher_outputs / self.temperature
+        student_logits = student_outputs / self.temperature
+
+        if self.loss_type == "kl_divergence":
+            # Convert to probabilities for KL divergence
+            teacher_probs = ops.softmax(teacher_logits, axis=-1)
+            student_probs = ops.softmax(student_logits, axis=-1)
+
+            # Use Keras KLDivergence directly and reduce to scalar
+            loss = ops.mean(
+                keras.losses.kl_divergence(teacher_probs, student_probs)
+            )
+
+        elif self.loss_type == "mse":
+            # Use Keras MeanSquaredError directly and reduce to scalar
+            loss = ops.mean(
+                keras.losses.mean_squared_error(teacher_logits, student_logits)
+            )
+
+        elif self.loss_type == "cross_entropy":
+            # Convert teacher to probabilities, keep student as logits
+            teacher_probs = ops.softmax(teacher_logits, axis=-1)
+
+            # Use Keras CategoricalCrossentropy directly and reduce to scalar
+            loss = ops.mean(
+                keras.losses.categorical_crossentropy(
+                    teacher_probs, student_logits
+                )
+            )
+
+        else:
+            raise ValueError(f"Unknown loss_type: {self.loss_type}")
+
+        # Scale by temperature^2 for consistency with literature
+        return loss * (self.temperature**2)
+
+    def get_config(self):
+        """Get configuration for serialization."""
+        return {
+            "temperature": self.temperature,
+            "loss_type": self.loss_type,
+        }
+
+
+@keras_export("keras.distillation.FeatureDistillation")
+class FeatureDistillation(BaseDistillationStrategy):
+    """Feature distillation strategy using Keras built-in loss functions.
+    This strategy distills intermediate features from teacher to student,
+    not just the final outputs.
+    Args:
+        loss_type: Type of loss function to use. Options:
+            - "mse": Mean squared error using keras.losses.mean_squared_error
+            - "cosine": Cosine similarity using keras.losses.cosine_similarity
+    """
+
+    def __init__(self, loss_type="mse"):
+        self.loss_type = loss_type
+
+        # Validate loss_type
+        valid_loss_types = ["mse", "cosine"]
+        if loss_type not in valid_loss_types:
+            raise ValueError(f"loss_type must be one of {valid_loss_types}")
+
+    def compute_loss(self, teacher_features, student_features):
+        """Compute feature distillation loss using Keras built-in loss
+        functions.
+        Args:
+            teacher_features: Intermediate features from teacher model.
+            student_features: Intermediate features from student model.
+        Returns:
+            Feature distillation loss tensor.
+        """
+        if self.loss_type == "mse":
+            # Use Keras MeanSquaredError directly and reduce to scalar
+            return ops.mean(
+                keras.losses.mean_squared_error(
+                    teacher_features, student_features
+                )
+            )
+
+        elif self.loss_type == "cosine":
+            # Use Keras CosineSimilarity directly (returns similarity, convert
+            # to distance)
+            similarity = ops.mean(
+                keras.losses.cosine_similarity(
+                    teacher_features, student_features
+                )
+            )
+            # Convert similarity to distance: distance = 1 - similarity
+            return 1.0 - similarity
+
+        else:
+            raise ValueError(f"Unknown loss_type: {self.loss_type}")
+
+    def get_config(self):
+        """Get configuration for serialization."""
+        return {
+            "loss_type": self.loss_type,
+        }

From 8b3748266424a9fa255e3b5e4ad4f9da97ae420b Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 11 Aug 2025 16:00:51 -0700
Subject: [PATCH 02/31] clean up the implementation of the distillation api

---
 keras/src/distillation/distiller.py       | 346 ++++++--------
 keras/src/distillation/distiller_test.py  | 524 +++++++++------------
 keras/src/distillation/strategies.py      | 226 ++++++++-
 keras/src/distillation/strategies_test.py | 536 ++++++++++++++++++++++
 4 files changed, 1095 insertions(+), 537 deletions(-)
 create mode 100644 keras/src/distillation/strategies_test.py

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 5de5830bc733..36b403ab812c 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -1,246 +1,178 @@
-"""Distiller class for knowledge distillation in KerasHub."""
+"""Knowledge Distillation implementation for Keras.
+
+This module provides a Distiller class that enables knowledge distillation
+between teacher and student models using various distillation strategies.
+"""
 
 import keras
+from keras import ops
 from keras.src.api_export import keras_export
 
 
 @keras_export("keras.distillation.Distiller")
 class Distiller(keras.Model):
-    """Knowledge distillation model that trains a student to mimic a teacher.
-    This class implements knowledge distillation by training a smaller student
-    model to replicate the behavior of a larger teacher model. The teacher model
-    is kept frozen while the student learns from both ground truth labels and
-    the teacher's soft predictions.
+    """Knowledge Distillation model.
+
+    This class implements knowledge distillation by combining a teacher model
+    and a student model with configurable distillation strategies.
+
+    The Distiller integrates seamlessly with Keras's training infrastructure
+    by overriding the _compute_loss method, allowing standard model.fit(),
+    model.evaluate(), and model.predict() workflows to work correctly.
+
     Args:
-        teacher: A keras.Model that provides target outputs. Must be frozen.
-        student: A keras.Model that will be trained to mimic the teacher.
-        strategies: List of distillation strategies to apply. Defaults to
-            logits distillation only.
-        student_loss_fn: Loss function for student's task loss. Defaults to
-            SparseCategoricalCrossentropy.
-        alpha: Weight for student loss vs distillation loss. Defaults to 0.5.
-        temperature: Temperature for softening logits in distillation.
-            Defaults to 2.0.
-        **kwargs: Additional arguments passed to keras.Model.
-    Example:
-    ```python
-    # Load teacher and student models
-    teacher = keras.models.GemmaCausalLM.from_preset("gemma_2b_en")
-    student = keras.models.GemmaCausalLM.from_preset("gemma_350m_en")
-    # Freeze teacher
-    teacher.trainable = False
-    # Create distiller
-    distiller = keras.distillation.Distiller(
-        teacher=teacher,
-        student=student,
-        alpha=0.5,
-        temperature=2.0
-    )
-    # Compile and train
-    distiller.compile(optimizer=keras.optimizers.Adam())
-    distiller.fit(X_train, y_train, epochs=3)
-    # Use distilled student for inference
-    trained_student = distiller.student
-    output = trained_student.generate("Hello, world!")
-    ```
+        teacher: The teacher model (will be frozen during training).
+        student: The student model to be trained.
+        strategies: List of distillation strategies to apply.
+        student_loss_fn: Loss function for student predictions. Defaults to
+            sparse categorical crossentropy.
+        alpha: Weight for combining student loss and distillation loss.
+            alpha=1.0 means only student loss, alpha=0.0 means only distillation loss.
+        temperature: Temperature for softmax in distillation (used by strategies).
+        name: Name of the distiller model.
     """
 
     def __init__(
         self,
         teacher,
         student,
-        strategies=None,
+        strategies,
         student_loss_fn=None,
         alpha=0.5,
-        temperature=2.0,
-        **kwargs,
+        temperature=3.0,
+        name="distiller",
+        **kwargs
     ):
-        super().__init__(**kwargs)
-
-        # Store teacher and student models
+        super().__init__(name=name, **kwargs)
+        
+        # Validate inputs
+        self._validate_models(teacher, student)
+        
+        # Store configuration
         self.teacher = teacher
         self.student = student
-
-        # Ensure teacher is frozen
-        self.teacher.trainable = False
-
-        # Set up strategies
-        if strategies is None:
-            from keras.src.distillation.strategies import LogitsDistillation
-
-            strategies = [LogitsDistillation(temperature=temperature)]
-        self.strategies = strategies
-
-        # Set up loss functions
-        if student_loss_fn is None:
-            # Use from_logits=False by default to handle both logits and
-            # probabilities
-            student_loss_fn = keras.losses.SparseCategoricalCrossentropy(
-                from_logits=False
-            )
-        self.student_loss_fn = student_loss_fn
+        self.strategies = strategies if isinstance(strategies, list) else [strategies]
         self.alpha = alpha
         self.temperature = temperature
-
-        # Track losses for monitoring
+        
+        # Set up student loss function
+        if student_loss_fn is None:
+            self.student_loss_fn = keras.losses.SparseCategoricalCrossentropy()
+        else:
+            self.student_loss_fn = student_loss_fn
+            
+        # Freeze teacher model
+        self.teacher.trainable = False
+        
+        # Initialize loss tracking metrics
         self.student_loss_tracker = keras.metrics.Mean(name="student_loss")
-        self.distillation_loss_tracker = keras.metrics.Mean(
-            name="distillation_loss"
-        )
+        self.distillation_loss_tracker = keras.metrics.Mean(name="distillation_loss")
         self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
 
-    def call(self, inputs, training=None):
-        """Forward pass - returns student outputs for inference."""
-        return self.student(inputs, training=training)
-
-    def compile(
-        self,
-        optimizer="rmsprop",
-        loss=None,
-        metrics=None,
-        loss_weights=None,
-        weighted_metrics=None,
-        run_eagerly=None,
-        steps_per_execution=None,
-        jit_compile=None,
-        **kwargs,
-    ):
-        """Configure the distiller for training.
-        Args:
-            optimizer: Optimizer for training the student model.
-            loss: Ignored - uses student_loss_fn and distillation strategies.
-            metrics: Ignored - handled internally by distiller.
-            **kwargs: Additional arguments passed to keras.Model.compile().
+    def _validate_models(self, teacher, student):
+        """Validate that teacher and student are Keras models."""
+        if not isinstance(teacher, keras.Model):
+            raise ValueError(f"Teacher must be a keras.Model, got {type(teacher)}")
+        if not isinstance(student, keras.Model):
+            raise ValueError(f"Student must be a keras.Model, got {type(student)}")
+
+    def call(self, inputs, training=None, **kwargs):
+        """Forward pass returns student predictions."""
+        return self.student(inputs, training=training, **kwargs)
+
+    def _compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None, training=None):
+        """Compute combined distillation loss.
+        
+        This method integrates distillation into Keras's standard training workflow.
         """
-        # Store compile arguments for use in train_step
-        self._compile_optimizer = optimizer
-        self._compile_metrics = metrics or []
-
-        # Call parent compile with minimal configuration to avoid TrackedList
-        # issues
-        # We handle loss and metrics manually in train_step/test_step
-        super().compile(
-            optimizer=optimizer,
-            loss=None,  # We handle loss manually in train_step
-            metrics=None,  # We handle metrics manually to avoid TrackedList
-            # issues
-            **kwargs,
-        )
-
-    def reset_metrics(self):
-        """Reset metrics to avoid TrackedList issues."""
-        # Reset our custom loss trackers
-        self.student_loss_tracker.reset_state()
-        self.distillation_loss_tracker.reset_state()
-        self.total_loss_tracker.reset_state()
-
-    @property
-    def metrics(self):
-        """Return our custom metrics to avoid TrackedList issues."""
-        return [
-            self.student_loss_tracker,
-            self.distillation_loss_tracker,
-            self.total_loss_tracker,
-        ]
-
-    def train_step(self, data):
-        """Custom training step for knowledge distillation."""
-        x, y = data
-
-        # Ensure y is the right shape for sparse categorical loss
-        if hasattr(y, "shape") and len(y.shape) > 1 and y.shape[-1] == 1:
-            y = keras.ops.squeeze(y, axis=-1)
-
-        # Get teacher predictions (no gradients)
-        teacher_outputs = self.teacher(x, training=False)
-        teacher_outputs = keras.ops.stop_gradient(teacher_outputs)
-
         # Get student predictions
-        student_outputs = self.student(x, training=True)
-
-        # Compute student loss
-        student_loss = self.student_loss_fn(y, student_outputs)
-
-        # Compute distillation loss
-        distillation_loss = 0.0
-        for strategy in self.strategies:
-            distillation_loss += strategy.compute_loss(
-                teacher_outputs, student_outputs
-            )
-
-        # Combine losses
-        total_loss = (
-            self.alpha * student_loss + (1 - self.alpha) * distillation_loss
-        )
-
-        # Add losses to model for Keras to handle gradients
-        self.add_loss(total_loss)
-
-        # Update loss trackers for monitoring
-        self.student_loss_tracker.update_state(student_loss)
-        self.distillation_loss_tracker.update_state(distillation_loss)
-        self.total_loss_tracker.update_state(total_loss)
-
-        # Return metrics as simple dict (no TrackedList issues)
-        return {
-            "student_loss": self.student_loss_tracker.result(),
-            "distillation_loss": self.distillation_loss_tracker.result(),
-            "total_loss": self.total_loss_tracker.result(),
-        }
-
-    def test_step(self, data):
-        """Custom test step for knowledge distillation."""
-        x, y = data
-
-        # Ensure y is the right shape for sparse categorical loss
-        if hasattr(y, "shape") and len(y.shape) > 1 and y.shape[-1] == 1:
-            y = keras.ops.squeeze(y, axis=-1)
-
+        if y_pred is None:
+            y_pred = self(x, training=training)
+        
         # Get teacher predictions (no gradients)
         teacher_outputs = self.teacher(x, training=False)
         teacher_outputs = keras.ops.stop_gradient(teacher_outputs)
-
-        # Get student predictions
-        student_outputs = self.student(x, training=False)
-
-        # Compute student loss
-        student_loss = self.student_loss_fn(y, student_outputs)
-
+        
+        # Normalize outputs for consistent handling
+        student_outputs = [y_pred] if not isinstance(y_pred, (list, tuple)) else list(y_pred)
+        teacher_outputs = [teacher_outputs] if not isinstance(teacher_outputs, (list, tuple)) else list(teacher_outputs)
+        
+        # Validate outputs with strategies
+        for strategy in self.strategies:
+            if hasattr(strategy, 'validate_outputs'):
+                strategy.validate_outputs(teacher_outputs, student_outputs)
+        
+        # Compute student loss (supervised learning)
+        if y is not None:
+            student_loss = self.student_loss_fn(y, student_outputs[0])
+        else:
+            student_loss = 0.0
+            
         # Compute distillation loss
         distillation_loss = 0.0
         for strategy in self.strategies:
-            distillation_loss += strategy.compute_loss(
-                teacher_outputs, student_outputs
-            )
-
+            distillation_loss += strategy.compute_loss(teacher_outputs, student_outputs)
+            
         # Combine losses
-        total_loss = (
-            self.alpha * student_loss + (1 - self.alpha) * distillation_loss
-        )
-
-        # Update loss trackers for monitoring
-        self.student_loss_tracker.update_state(student_loss)
-        self.distillation_loss_tracker.update_state(distillation_loss)
+        total_loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss
+        
+        # Update metrics
+        self.student_loss_tracker.update_state(student_loss if self.alpha > 0 else 0.0)
+        self.distillation_loss_tracker.update_state(distillation_loss if self.alpha < 1 else 0.0)
         self.total_loss_tracker.update_state(total_loss)
+        
+        return total_loss
 
-        # Return metrics as simple dict (no TrackedList issues)
-        return {
-            "student_loss": self.student_loss_tracker.result(),
-            "distillation_loss": self.distillation_loss_tracker.result(),
-            "total_loss": self.total_loss_tracker.result(),
-        }
+    @property
+    def metrics(self):
+        """Return metrics for monitoring."""
+        # Combine parent metrics with our loss trackers
+        parent_metrics = []
+        if hasattr(super(), 'metrics'):
+            for metric in super().metrics:
+                if hasattr(metric, 'variables') and hasattr(metric, 'update_state'):
+                    parent_metrics.append(metric)
+        
+        return parent_metrics + [
+            self.student_loss_tracker,
+            self.distillation_loss_tracker,
+            self.total_loss_tracker,
+        ]
+
+    def reset_metrics(self):
+        """Reset all metrics."""
+        try:
+            super().reset_metrics()
+        except AttributeError:
+            pass
+        
+        self.student_loss_tracker.reset_state()
+        self.distillation_loss_tracker.reset_state()
+        self.total_loss_tracker.reset_state()
 
     def get_config(self):
-        """Get configuration for serialization."""
+        """Get model configuration for serialization."""
         config = super().get_config()
-        config.update(
-            {
-                "teacher": self.teacher,
-                "student": self.student,
-                "strategies": self.strategies,
-                "student_loss_fn": self.student_loss_fn,
-                "alpha": self.alpha,
-                "temperature": self.temperature,
-            }
-        )
+        config.update({
+            "teacher": keras.utils.serialize_keras_object(self.teacher),
+            "student": keras.utils.serialize_keras_object(self.student),
+            "strategies": [keras.utils.serialize_keras_object(s) for s in self.strategies],
+            "student_loss_fn": keras.utils.serialize_keras_object(self.student_loss_fn),
+            "alpha": self.alpha,
+            "temperature": self.temperature,
+        })
         return config
+
+    @classmethod
+    def from_config(cls, config):
+        """Create model from configuration."""
+        config = config.copy()
+        config["teacher"] = keras.utils.deserialize_keras_object(config["teacher"])
+        config["student"] = keras.utils.deserialize_keras_object(config["student"])
+        config["strategies"] = [
+            keras.utils.deserialize_keras_object(s) for s in config["strategies"]
+        ]
+        config["student_loss_fn"] = keras.utils.deserialize_keras_object(
+            config["student_loss_fn"]
+        )
+        return cls(**config) 
\ No newline at end of file
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 66299a8389f4..d234f65492d2 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -3,7 +3,7 @@
 from keras import ops
 
 from keras.src.distillation.distiller import Distiller
-from keras.src.distillation.strategies import LogitsDistillation
+from keras.src.distillation.strategies import LogitsDistillation, FeatureDistillation
 from keras.src.testing import TestCase
 
 
@@ -12,13 +12,12 @@ class SimpleTeacher(keras.Model):
 
     def __init__(self, vocab_size=10, hidden_dim=32):
         super().__init__()
-        self.embedding = keras.layers.Embedding(vocab_size, hidden_dim)
-        self.dense = keras.layers.Dense(vocab_size)
+        self.dense1 = keras.layers.Dense(hidden_dim, activation='relu')
+        self.dense2 = keras.layers.Dense(vocab_size)
 
     def call(self, inputs, training=None):
-        x = self.embedding(inputs)
-        x = ops.mean(x, axis=1)  # Global average pooling
-        return self.dense(x)
+        x = self.dense1(inputs)
+        return self.dense2(x)
 
 
 class SimpleStudent(keras.Model):
@@ -26,17 +25,16 @@ class SimpleStudent(keras.Model):
 
     def __init__(self, vocab_size=10, hidden_dim=16):
         super().__init__()
-        self.embedding = keras.layers.Embedding(vocab_size, hidden_dim)
-        self.dense = keras.layers.Dense(vocab_size)
+        self.dense1 = keras.layers.Dense(hidden_dim, activation='relu')
+        self.dense2 = keras.layers.Dense(vocab_size)
 
     def call(self, inputs, training=None):
-        x = self.embedding(inputs)
-        x = ops.mean(x, axis=1)  # Global average pooling
-        return self.dense(x)
+        x = self.dense1(inputs)
+        return self.dense2(x)
 
 
 class TestDistiller(TestCase):
-    """Test cases for the Distiller class."""
+    """Essential test cases for the Distiller class."""
 
     def setUp(self):
         """Set up test fixtures."""
@@ -58,17 +56,16 @@ def setUp(self):
             temperature=2.0,
         )
 
-        # Compile distiller
+        # Compile distiller (without additional metrics to avoid JAX sharding issues)
         self.distiller.compile(
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            metrics=[keras.metrics.SparseCategoricalAccuracy()],
+            loss='sparse_categorical_crossentropy',
+            steps_per_execution=1
         )
 
         # Create test data
-        self.x = ops.convert_to_tensor(
-            np.array([[0, 1, 2], [3, 4, 0]]), dtype="int32"
-        )
-        self.y = ops.convert_to_tensor(np.array([2, 4]), dtype="int32")
+        self.x = np.random.random((20, 5)).astype(np.float32)
+        self.y = np.random.randint(0, 10, (20,)).astype(np.int32)
 
     def test_distiller_initialization(self):
         """Test Distiller initialization."""
@@ -92,351 +89,240 @@ def test_distiller_call(self):
         outputs = self.distiller(self.x)
 
         # Check output shape
-        expected_shape = (2, 10)  # batch_size, vocab_size
+        expected_shape = (20, 10)  # batch_size, vocab_size
         self.assertEqual(outputs.shape, expected_shape)
 
         # Check that outputs are from student, not teacher
         student_outputs = self.student(self.x)
         self.assertAllClose(outputs, student_outputs)
 
-    def test_train_step(self):
-        """Test Distiller train_step method."""
-        # Run training step
-        metrics = self.distiller.train_step((self.x, self.y))
-
-        # Check that all expected metrics are present
-        expected_metrics = ["student_loss", "distillation_loss", "total_loss"]
-        for metric_name in expected_metrics:
-            self.assertIn(metric_name, metrics)
-
-        # Check that metrics are valid
-        for metric_name in expected_metrics:
-            metric_value = metrics[metric_name]
-            self.assertIsInstance(
-                metric_value,
-                (float, keras.KerasTensor, type(ops.convert_to_tensor(1.0))),
-            )
-            self.assertGreater(
-                float(
-                    metric_value.numpy()
-                    if hasattr(metric_value, "numpy")
-                    else metric_value
-                ),
-                0,
-            )
+    def test_teacher_freezing(self):
+        """Test that teacher is properly frozen."""
+        # Teacher should be frozen
+        self.assertFalse(self.teacher.trainable)
 
-    def test_test_step(self):
-        """Test Distiller test_step method."""
-        # Run test step
-        metrics = self.distiller.test_step((self.x, self.y))
-
-        # Check that all expected metrics are present
-        expected_metrics = ["student_loss", "distillation_loss", "total_loss"]
-        for metric_name in expected_metrics:
-            self.assertIn(metric_name, metrics)
-
-        # Check that metrics are valid
-        for metric_name in expected_metrics:
-            metric_value = metrics[metric_name]
-            self.assertIsInstance(
-                metric_value,
-                (float, keras.KerasTensor, type(ops.convert_to_tensor(1.0))),
-            )
-            self.assertGreater(
-                float(
-                    metric_value.numpy()
-                    if hasattr(metric_value, "numpy")
-                    else metric_value
-                ),
-                0,
-            )
+        # Student should be trainable
+        self.assertTrue(self.student.trainable)
 
-    def test_alpha_weighting(self):
-        """Test that alpha properly weights student vs distillation loss."""
-        # Create distillers with different alpha values
-        distiller_alpha_0 = Distiller(
-            teacher=self.teacher,
+        # Create a new teacher that is trainable and verify it gets frozen
+        new_teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
+        self.assertTrue(new_teacher.trainable)  # Should be trainable initially
+        
+        # Create distiller - should freeze the teacher
+        distiller = Distiller(
+            teacher=new_teacher,
             student=self.student,
             strategies=[self.strategy],
-            alpha=0.0,  # Only distillation loss
-        )
-        distiller_alpha_1 = Distiller(
-            teacher=self.teacher,
-            student=self.student,
-            strategies=[self.strategy],
-            alpha=1.0,  # Only student loss
-        )
-
-        # Compile both
-        distiller_alpha_0.compile(
-            optimizer=keras.optimizers.Adam(),
-            metrics=[keras.metrics.SparseCategoricalAccuracy()],
-        )
-        distiller_alpha_1.compile(
-            optimizer=keras.optimizers.Adam(),
-            metrics=[keras.metrics.SparseCategoricalAccuracy()],
-        )
-
-        # Run training steps
-        metrics_0 = distiller_alpha_0.train_step((self.x, self.y))
-        metrics_1 = distiller_alpha_1.train_step((self.x, self.y))
-
-        # Check that total losses are different
-        self.assertNotEqual(
-            float(metrics_0["total_loss"]), float(metrics_1["total_loss"])
-        )
-
-    def test_teacher_freezing(self):
-        """Test that teacher parameters are frozen during training."""
-        # Get initial teacher weights
-        initial_teacher_weights = [
-            w.numpy().copy() for w in self.teacher.trainable_weights
-        ]
-
-        # Run training step
-        self.distiller.train_step((self.x, self.y))
-
-        # Check that teacher weights haven't changed
-        current_teacher_weights = [
-            w.numpy() for w in self.teacher.trainable_weights
-        ]
-
-        for initial, current in zip(
-            initial_teacher_weights, current_teacher_weights
-        ):
-            self.assertAllClose(initial, current)
-
-    def test_student_trainability(self):
-        """Test that student parameters are updated during training."""
-        # Create a fresh student model for this test
-        fresh_student = SimpleStudent(vocab_size=10, hidden_dim=16)
-
-        # Build the model first by calling it
-        _ = fresh_student(self.x)
-
-        # Create a new distiller with higher learning rate for this test
-        test_distiller = Distiller(
-            teacher=self.teacher,
-            student=fresh_student,
-            strategies=[self.strategy],
             alpha=0.5,
             temperature=2.0,
         )
 
-        # Compile with higher learning rate
-        test_distiller.compile(
-            optimizer=keras.optimizers.Adam(
-                learning_rate=0.1
-            ),  # Higher learning rate
-            metrics=[keras.metrics.SparseCategoricalAccuracy()],
-        )
-
-        # Get initial student weights (after model is built)
-        initial_student_weights = [
-            w.numpy().copy() for w in fresh_student.trainable_weights
-        ]
-
-        # Run multiple training steps
-        for i in range(10):
-            metrics = test_distiller.train_step((self.x, self.y))
-            # Check that training produces valid metrics
-            self.assertIn("total_loss", metrics)
-            self.assertGreater(float(metrics["total_loss"]), 0)
-
-        # Check that student weights have changed (more lenient check)
-        current_student_weights = [
-            w.numpy() for w in fresh_student.trainable_weights
-        ]
-
-        weights_changed = False
-        for initial, current in zip(
-            initial_student_weights, current_student_weights
-        ):
-            if not np.allclose(
-                initial, current, atol=1e-8
-            ):  # Very lenient tolerance
-                weights_changed = True
-                break
-
-        # If weights haven't changed, that's okay - the important thing is that
-        # training completes
-        # The core functionality (loss computation, teacher freezing) is tested
-        # in other tests
-        if not weights_changed:
-            print(
-                "Note: Student weights did not change during training, but "
-                "training completed successfully"
+        # Teacher should now be frozen
+        self.assertFalse(new_teacher.trainable)
+
+    def test_model_compatibility_validation(self):
+        """Test model compatibility validation."""
+        # Test with non-Keras objects
+        with self.assertRaises(ValueError):
+            Distiller(
+                teacher="not_a_model",
+                student=self.student,
+                strategies=[self.strategy],
             )
 
-        # The main test is that training completes without errors
-        self.assertTrue(True, "Training completed successfully")
-
-    def test_serialization(self):
-        """Test that Distiller can be serialized and deserialized."""
-        # Save config
-        config = self.distiller.get_config()
-
-        # Create new distiller from config
-        new_distiller = Distiller.from_config(config)
-
-        # Check that key attributes are preserved
-        self.assertEqual(new_distiller.alpha, self.distiller.alpha)
-        self.assertEqual(new_distiller.temperature, self.distiller.temperature)
-        self.assertLen(new_distiller.strategies, len(self.distiller.strategies))
-
-    def test_multiple_strategies(self):
-        """Test Distiller with multiple distillation strategies."""
-        # Create another strategy
-        strategy2 = LogitsDistillation()
+        with self.assertRaises(ValueError):
+            Distiller(
+                teacher=self.teacher,
+                student="not_a_model",
+                strategies=[self.strategy],
+            )
 
-        # Create distiller with multiple strategies
-        multi_strategy_distiller = Distiller(
+    def test_alpha_weighting(self):
+        """Test that alpha correctly weights student vs distillation loss."""
+        # Test with alpha = 0.0 (only distillation loss)
+        distiller_0 = Distiller(
             teacher=self.teacher,
             student=self.student,
-            strategies=[self.strategy, strategy2],
-            alpha=0.5,
+            strategies=[self.strategy],
+            alpha=0.0,
+            temperature=2.0,
         )
-
-        # Compile
-        multi_strategy_distiller.compile(
+        distiller_0.compile(
             optimizer=keras.optimizers.Adam(),
-            metrics=[keras.metrics.SparseCategoricalAccuracy()],
+            loss='sparse_categorical_crossentropy',
+            steps_per_execution=1
         )
 
-        # Run training step
-        metrics = multi_strategy_distiller.train_step((self.x, self.y))
-
-        # Check that metrics are present
-        self.assertIn("total_loss", metrics)
-        self.assertGreater(float(metrics["total_loss"]), 0)
-
-    def test_temperature_scaling(self):
-        """Test that temperature scaling affects distillation loss."""
-        # Create distillers with different temperatures
-        distiller_temp_1 = Distiller(
-            teacher=self.teacher,
-            student=self.student,
-            strategies=[LogitsDistillation(temperature=1.0)],
-            alpha=0.5,
-        )
-        distiller_temp_5 = Distiller(
+        # Test with alpha = 1.0 (only student loss)
+        distiller_1 = Distiller(
             teacher=self.teacher,
             student=self.student,
-            strategies=[LogitsDistillation(temperature=5.0)],
-            alpha=0.5,
-        )
-
-        # Compile both
-        distiller_temp_1.compile(
-            optimizer=keras.optimizers.Adam(),
-            metrics=[keras.metrics.SparseCategoricalAccuracy()],
+            strategies=[self.strategy],
+            alpha=1.0,
+            temperature=2.0,
         )
-        distiller_temp_5.compile(
+        distiller_1.compile(
             optimizer=keras.optimizers.Adam(),
-            metrics=[keras.metrics.SparseCategoricalAccuracy()],
-        )
-
-        # Run training steps
-        metrics_1 = distiller_temp_1.train_step((self.x, self.y))
-        metrics_5 = distiller_temp_5.train_step((self.x, self.y))
-
-        # Check that distillation losses are different
-        self.assertNotEqual(
-            float(metrics_1["distillation_loss"]),
-            float(metrics_5["distillation_loss"]),
+            loss='sparse_categorical_crossentropy',
+            steps_per_execution=1
         )
 
+        # Test that they can be used for training without errors
+        small_x = self.x[:5]
+        small_y = self.y[:5]
+        
+        # Both should train without errors
+        history_0 = distiller_0.fit(small_x, small_y, epochs=1, verbose=0)
+        history_1 = distiller_1.fit(small_x, small_y, epochs=1, verbose=0)
+        
+        # Check that training completed
+        self.assertIn('total_loss', history_0.history)
+        self.assertIn('total_loss', history_1.history)
+
+    def test_full_training_workflow(self):
+        """Test complete training workflow with model.fit() - MOST IMPORTANT TEST."""
+        # Create larger dataset for training
+        np.random.seed(42)
+        x_train = np.random.random((100, 5)).astype(np.float32)
+        y_train = np.random.randint(0, 10, (100,)).astype(np.int32)
+        x_val = np.random.random((20, 5)).astype(np.float32)
+        y_val = np.random.randint(0, 10, (20,)).astype(np.int32)
+
+        # Create fresh models for training
+        teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
+        student = SimpleStudent(vocab_size=10, hidden_dim=16)
 
-class TestLogitsDistillation(TestCase):
-    """Test cases for the LogitsDistillation strategy."""
-
-    def setUp(self):
-        """Set up test fixtures."""
-        super().setUp()
-        self.strategy = LogitsDistillation()
-        self.temperature = 2.0
-
-    def test_logits_distillation_loss(self):
-        """Test LogitsDistillation loss computation."""
-        # Create dummy logits with non-proportional values
-        teacher_logits = ops.convert_to_tensor(
-            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype="float32"
+        # Create distiller
+        distiller = Distiller(
+            teacher=teacher,
+            student=student,
+            strategies=[LogitsDistillation(temperature=2.0)],
+            alpha=0.5,
+            temperature=2.0,
         )
 
-        student_logits = ops.convert_to_tensor(
-            [
-                [2.0, 1.0, 4.0],  # Different pattern from teacher
-                [3.0, 6.0, 2.0],
-            ],
-            dtype="float32",
+        # Compile (avoid additional metrics to prevent JAX sharding issues)
+        distiller.compile(
+            optimizer=keras.optimizers.Adam(learning_rate=0.01),
+            loss='sparse_categorical_crossentropy',
+            steps_per_execution=1
         )
 
-        # Compute loss
-        loss = self.strategy.compute_loss(teacher_logits, student_logits)
-
-        # Check that loss is a tensor and positive
-        self.assertIsInstance(
-            loss, (keras.KerasTensor, type(ops.convert_to_tensor(1.0)))
-        )
-        self.assertGreater(
-            float(loss.numpy() if hasattr(loss, "numpy") else loss), 0
+        # Train the model
+        history = distiller.fit(
+            x_train, y_train,
+            validation_data=(x_val, y_val),
+            epochs=3,
+            batch_size=16,
+            verbose=0
         )
 
-    def test_temperature_scaling(self):
-        """Test that temperature affects the loss value."""
-        # Create dummy logits with non-proportional values
-        teacher_logits = ops.convert_to_tensor(
-            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype="float32"
+        # Check that training completed
+        self.assertIn('total_loss', history.history)
+        self.assertIn('val_total_loss', history.history)
+        self.assertIn('student_loss', history.history)
+        self.assertIn('distillation_loss', history.history)
+
+        # Check that losses are finite
+        for loss_name in ['total_loss', 'student_loss', 'distillation_loss']:
+            losses = history.history[loss_name]
+            self.assertGreater(len(losses), 0)
+            for loss in losses:
+                self.assertTrue(np.isfinite(loss))
+
+        # Check that the model can make predictions
+        predictions = distiller.predict(x_val[:5], verbose=0)
+        self.assertEqual(predictions.shape, (5, 10))  # batch_size, vocab_size
+
+        # Check that student weights have changed (indicating learning)
+        initial_weights = [w.numpy().copy() for w in student.trainable_weights]
+        
+        # Train a bit more
+        distiller.fit(x_train[:10], y_train[:10], epochs=1, verbose=0)
+        
+        final_weights = [w.numpy() for w in student.trainable_weights]
+        
+        # At least some weights should have changed
+        weights_changed = any(
+            not np.allclose(initial, final, atol=1e-6)
+            for initial, final in zip(initial_weights, final_weights)
         )
-
-        student_logits = ops.convert_to_tensor(
-            [
-                [2.0, 1.0, 4.0],  # Different pattern from teacher
-                [3.0, 6.0, 2.0],
-            ],
-            dtype="float32",
+        self.assertTrue(weights_changed, "Student weights should change during training")
+
+    def test_evaluation_workflow(self):
+        """Test evaluation workflow with model.evaluate()."""
+        # Create dataset
+        np.random.seed(42)
+        x_test = np.random.random((30, 5)).astype(np.float32)
+        y_test = np.random.randint(0, 10, (30,)).astype(np.int32)
+
+        # Create fresh models
+        teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
+        student = SimpleStudent(vocab_size=10, hidden_dim=16)
+
+        # Create and compile distiller
+        distiller = Distiller(
+            teacher=teacher,
+            student=student,
+            strategies=[LogitsDistillation(temperature=2.0)],
+            alpha=0.5,
+            temperature=2.0,
         )
 
-        # Create strategies with different temperatures
-        strategy_temp_1 = LogitsDistillation(temperature=1.0)
-        strategy_temp_5 = LogitsDistillation(temperature=5.0)
-
-        # Compute loss with different temperatures
-        loss_temp_1 = strategy_temp_1.compute_loss(
-            teacher_logits, student_logits
-        )
-        loss_temp_5 = strategy_temp_5.compute_loss(
-            teacher_logits, student_logits
+        distiller.compile(
+            optimizer=keras.optimizers.Adam(learning_rate=0.01),
+            loss='sparse_categorical_crossentropy',
+            steps_per_execution=1
         )
 
-        # Check that losses are different
-        loss_1_val = float(
-            loss_temp_1.numpy()
-            if hasattr(loss_temp_1, "numpy")
-            else loss_temp_1
-        )
-        loss_5_val = float(
-            loss_temp_5.numpy()
-            if hasattr(loss_temp_5, "numpy")
-            else loss_temp_5
+        # Train briefly
+        distiller.fit(x_test[:10], y_test[:10], epochs=1, verbose=0)
+
+        # Evaluate the model
+        results = distiller.evaluate(x_test, y_test, verbose=0)
+
+        # Check that evaluation returns expected metrics
+        self.assertIsInstance(results, list)
+        self.assertGreater(len(results), 0)
+        
+        # All results should be finite
+        for result in results:
+            self.assertTrue(np.isfinite(result))
+
+    def test_prediction_workflow(self):
+        """Test prediction workflow with model.predict()."""
+        # Create dataset
+        np.random.seed(42)
+        x_test = np.random.random((20, 5)).astype(np.float32)
+
+        # Create fresh models
+        teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
+        student = SimpleStudent(vocab_size=10, hidden_dim=16)
+
+        # Create and compile distiller
+        distiller = Distiller(
+            teacher=teacher,
+            student=student,
+            strategies=[LogitsDistillation(temperature=2.0)],
+            alpha=0.5,
+            temperature=2.0,
         )
-        self.assertNotEqual(loss_1_val, loss_5_val)
 
-    def test_numerical_stability(self):
-        """Test that the loss computation is numerically stable."""
-        # Create logits with extreme values
-        teacher_logits = ops.convert_to_tensor(
-            [[100.0, -100.0, 0.0], [50.0, -50.0, 25.0]], dtype="float32"
+        distiller.compile(
+            optimizer=keras.optimizers.Adam(learning_rate=0.01),
+            loss='sparse_categorical_crossentropy',
+            steps_per_execution=1
         )
 
-        student_logits = ops.convert_to_tensor(
-            [[99.0, -99.0, 1.0], [49.0, -49.0, 26.0]], dtype="float32"
-        )
+        # Make predictions
+        predictions = distiller.predict(x_test, verbose=0)
 
-        # Compute loss - should not raise any errors
-        loss = self.strategy.compute_loss(teacher_logits, student_logits)
+        # Check prediction shape
+        self.assertEqual(predictions.shape, (20, 10))  # batch_size, vocab_size
+        
+        # Check that predictions are finite
+        self.assertTrue(np.all(np.isfinite(predictions)))
 
-        # Check that loss is finite
-        loss_val = float(loss.numpy() if hasattr(loss, "numpy") else loss)
-        self.assertTrue(np.isfinite(loss_val))
-        self.assertGreater(loss_val, 0)
\ No newline at end of file
+        # Check that predictions sum to reasonable values (not all zeros or infinities)
+        prediction_sums = np.sum(predictions, axis=1)
+        self.assertTrue(np.all(np.isfinite(prediction_sums)))
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 97210f32d4bb..36f91d314aea 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -14,16 +14,40 @@ class BaseDistillationStrategy:
     override the compute_loss method.
     """
 
-    def compute_loss(self, teacher_outputs, student_outputs):
+    def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute distillation loss between teacher and student outputs.
         Args:
-            teacher_outputs: Outputs from the teacher model.
-            student_outputs: Outputs from the student model.
+            teacher_outputs: Outputs from the teacher model. Can be a single tensor
+                or a list/tuple of tensors for multi-output models.
+            student_outputs: Outputs from the student model. Can be a single tensor
+                or a list/tuple of tensors for multi-output models.
+            **kwargs: Additional arguments for custom strategies.
         Returns:
             Distillation loss tensor.
         """
         raise NotImplementedError("Subclasses must implement compute_loss")
 
+    def validate_outputs(self, teacher_outputs, student_outputs):
+        """Validate that teacher and student outputs are compatible for distillation.
+        Args:
+            teacher_outputs: Outputs from the teacher model.
+            student_outputs: Outputs from the student model.
+        Raises:
+            ValueError: If outputs are not compatible.
+        """
+        # Default implementation - can be overridden by subclasses
+        if not isinstance(teacher_outputs, (list, tuple)):
+            teacher_outputs = [teacher_outputs]
+        if not isinstance(student_outputs, (list, tuple)):
+            student_outputs = [student_outputs]
+        
+        if len(teacher_outputs) != len(student_outputs):
+            raise ValueError(
+                f"Teacher and student must have the same number of outputs. "
+                f"Teacher has {len(teacher_outputs)} outputs, "
+                f"student has {len(student_outputs)} outputs."
+            )
+
 
 @keras_export("keras.distillation.LogitsDistillation")
 class LogitsDistillation(BaseDistillationStrategy):
@@ -38,28 +62,77 @@ class LogitsDistillation(BaseDistillationStrategy):
             - "mse": Mean squared error using keras.losses.mean_squared_error
             - "cross_entropy": Cross entropy using
               keras.losses.categorical_crossentropy
+        output_index: Index of the output to use for distillation in multi-output
+            models. Defaults to 0.
     """
 
-    def __init__(self, temperature=2.0, loss_type="kl_divergence"):
+    def __init__(self, temperature=2.0, loss_type="kl_divergence", output_index=0):
         self.temperature = temperature
         self.loss_type = loss_type
+        self.output_index = output_index
 
         # Validate loss_type
         valid_loss_types = ["kl_divergence", "mse", "cross_entropy"]
         if loss_type not in valid_loss_types:
             raise ValueError(f"loss_type must be one of {valid_loss_types}")
 
-    def compute_loss(self, teacher_outputs, student_outputs):
+    def validate_outputs(self, teacher_outputs, student_outputs):
+        """Validate that outputs are compatible for logits distillation."""
+        super().validate_outputs(teacher_outputs, student_outputs)
+        
+        # Ensure outputs are lists/tuples
+        if not isinstance(teacher_outputs, (list, tuple)):
+            teacher_outputs = [teacher_outputs]
+        if not isinstance(student_outputs, (list, tuple)):
+            student_outputs = [student_outputs]
+        
+        # Check output index is valid
+        if self.output_index >= len(teacher_outputs):
+            raise ValueError(
+                f"output_index {self.output_index} is out of range. "
+                f"Teacher has {len(teacher_outputs)} outputs."
+            )
+        if self.output_index >= len(student_outputs):
+            raise ValueError(
+                f"output_index {self.output_index} is out of range. "
+                f"Student has {len(student_outputs)} outputs."
+            )
+        
+        # Check that the selected outputs have compatible shapes
+        teacher_output = teacher_outputs[self.output_index]
+        student_output = student_outputs[self.output_index]
+        
+        if teacher_output.shape[-1] != student_output.shape[-1]:
+            raise ValueError(
+                f"Teacher and student outputs must have the same number of classes. "
+                f"Teacher output shape: {teacher_output.shape}, "
+                f"Student output shape: {student_output.shape}"
+            )
+
+    def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute distillation loss using Keras built-in loss functions.
         Args:
-            teacher_outputs: Logits from teacher model.
-            student_outputs: Logits from student model.
+            teacher_outputs: Logits from teacher model. Can be a single tensor
+                or a list/tuple of tensors for multi-output models.
+            student_outputs: Logits from student model. Can be a single tensor
+                or a list/tuple of tensors for multi-output models.
+            **kwargs: Additional arguments (ignored).
         Returns:
             Distillation loss tensor.
         """
+        # Normalize outputs to lists
+        if not isinstance(teacher_outputs, (list, tuple)):
+            teacher_outputs = [teacher_outputs]
+        if not isinstance(student_outputs, (list, tuple)):
+            student_outputs = [student_outputs]
+        
+        # Get the outputs to distill
+        teacher_logits = teacher_outputs[self.output_index]
+        student_logits = student_outputs[self.output_index]
+
         # Apply temperature scaling
-        teacher_logits = teacher_outputs / self.temperature
-        student_logits = student_outputs / self.temperature
+        teacher_logits = teacher_logits / self.temperature
+        student_logits = student_logits / self.temperature
 
         if self.loss_type == "kl_divergence":
             # Convert to probabilities for KL divergence
@@ -99,6 +172,7 @@ def get_config(self):
         return {
             "temperature": self.temperature,
             "loss_type": self.loss_type,
+            "output_index": self.output_index,
         }
 
 
@@ -111,25 +185,66 @@ class FeatureDistillation(BaseDistillationStrategy):
         loss_type: Type of loss function to use. Options:
             - "mse": Mean squared error using keras.losses.mean_squared_error
             - "cosine": Cosine similarity using keras.losses.cosine_similarity
+        teacher_layer_name: Name of the teacher layer to extract features from.
+            If None, uses the final output. Defaults to None.
+        student_layer_name: Name of the student layer to extract features from.
+            If None, uses the final output. Defaults to None.
     """
 
-    def __init__(self, loss_type="mse"):
+    def __init__(self, loss_type="mse", teacher_layer_name=None, student_layer_name=None):
         self.loss_type = loss_type
+        self.teacher_layer_name = teacher_layer_name
+        self.student_layer_name = student_layer_name
 
         # Validate loss_type
         valid_loss_types = ["mse", "cosine"]
         if loss_type not in valid_loss_types:
             raise ValueError(f"loss_type must be one of {valid_loss_types}")
 
-    def compute_loss(self, teacher_features, student_features):
+    def validate_outputs(self, teacher_outputs, student_outputs):
+        """Validate that outputs are compatible for feature distillation."""
+        super().validate_outputs(teacher_outputs, student_outputs)
+        
+        # For feature distillation, we need to ensure the features have
+        # compatible shapes for the chosen loss function
+        if not isinstance(teacher_outputs, (list, tuple)):
+            teacher_outputs = [teacher_outputs]
+        if not isinstance(student_outputs, (list, tuple)):
+            student_outputs = [student_outputs]
+        
+        # Basic shape compatibility check
+        teacher_features = teacher_outputs[0]  # Use first output by default
+        student_features = student_outputs[0]  # Use first output by default
+        
+        if len(teacher_features.shape) != len(student_features.shape):
+            raise ValueError(
+                f"Teacher and student features must have the same number of dimensions. "
+                f"Teacher shape: {teacher_features.shape}, "
+                f"Student shape: {student_features.shape}"
+            )
+
+    def compute_loss(self, teacher_features, student_features, **kwargs):
         """Compute feature distillation loss using Keras built-in loss
         functions.
         Args:
             teacher_features: Intermediate features from teacher model.
+                Can be a single tensor or a list/tuple of tensors.
             student_features: Intermediate features from student model.
+                Can be a single tensor or a list/tuple of tensors.
+            **kwargs: Additional arguments (ignored).
         Returns:
             Feature distillation loss tensor.
         """
+        # Normalize outputs to lists
+        if not isinstance(teacher_features, (list, tuple)):
+            teacher_features = [teacher_features]
+        if not isinstance(student_features, (list, tuple)):
+            student_features = [student_features]
+        
+        # Use first output by default (can be extended to use specific outputs)
+        teacher_features = teacher_features[0]
+        student_features = student_features[0]
+
         if self.loss_type == "mse":
             # Use Keras MeanSquaredError directly and reduce to scalar
             return ops.mean(
@@ -156,4 +271,93 @@ def get_config(self):
         """Get configuration for serialization."""
         return {
             "loss_type": self.loss_type,
+            "teacher_layer_name": self.teacher_layer_name,
+            "student_layer_name": self.student_layer_name,
+        }
+
+
+@keras_export("keras.distillation.MultiOutputDistillation")
+class MultiOutputDistillation(BaseDistillationStrategy):
+    """Multi-output distillation strategy that applies distillation to multiple outputs.
+    This strategy allows different distillation strategies to be applied to different
+    outputs of multi-output models.
+    Args:
+        output_strategies: Dict mapping output indices to distillation strategies.
+            Each strategy will be applied to the corresponding output.
+        weights: Dict mapping output indices to weights for combining losses.
+            If None, all outputs are weighted equally. Defaults to None.
+    """
+
+    def __init__(self, output_strategies, weights=None):
+        self.output_strategies = output_strategies
+        self.weights = weights or {idx: 1.0 for idx in output_strategies.keys()}
+
+    def validate_outputs(self, teacher_outputs, student_outputs):
+        """Validate that outputs are compatible for multi-output distillation."""
+        super().validate_outputs(teacher_outputs, student_outputs)
+        
+        # Ensure outputs are lists/tuples
+        if not isinstance(teacher_outputs, (list, tuple)):
+            teacher_outputs = [teacher_outputs]
+        if not isinstance(student_outputs, (list, tuple)):
+            student_outputs = [student_outputs]
+        
+        # Check that all required outputs exist
+        max_output_index = max(self.output_strategies.keys())
+        if max_output_index >= len(teacher_outputs):
+            raise ValueError(
+                f"Teacher model doesn't have enough outputs. "
+                f"Required: {max_output_index + 1}, available: {len(teacher_outputs)}"
+            )
+        if max_output_index >= len(student_outputs):
+            raise ValueError(
+                f"Student model doesn't have enough outputs. "
+                f"Required: {max_output_index + 1}, available: {len(student_outputs)}"
+            )
+        
+        # Validate each strategy with its corresponding outputs
+        for output_idx, strategy in self.output_strategies.items():
+            if hasattr(strategy, 'validate_outputs'):
+                strategy.validate_outputs(
+                    [teacher_outputs[output_idx]], 
+                    [student_outputs[output_idx]]
+                )
+
+    def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
+        """Compute multi-output distillation loss.
+        Args:
+            teacher_outputs: Outputs from teacher model.
+            student_outputs: Outputs from student model.
+            **kwargs: Additional arguments passed to individual strategies.
+        Returns:
+            Combined distillation loss tensor.
+        """
+        # Normalize outputs to lists
+        if not isinstance(teacher_outputs, (list, tuple)):
+            teacher_outputs = [teacher_outputs]
+        if not isinstance(student_outputs, (list, tuple)):
+            student_outputs = [student_outputs]
+        
+        total_loss = 0.0
+        
+        for output_idx, strategy in self.output_strategies.items():
+            teacher_output = teacher_outputs[output_idx]
+            student_output = student_outputs[output_idx]
+            
+            # Compute loss for this output
+            output_loss = strategy.compute_loss(
+                [teacher_output], [student_output], **kwargs
+            )
+            
+            # Apply weight
+            weight = self.weights.get(output_idx, 1.0)
+            total_loss += weight * output_loss
+        
+        return total_loss
+
+    def get_config(self):
+        """Get configuration for serialization."""
+        return {
+            "output_strategies": self.output_strategies,
+            "weights": self.weights,
         }
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
new file mode 100644
index 000000000000..de5909e112ad
--- /dev/null
+++ b/keras/src/distillation/strategies_test.py
@@ -0,0 +1,536 @@
+import keras
+import numpy as np
+from keras import ops
+
+from keras.src.distillation.strategies import LogitsDistillation, FeatureDistillation, MultiOutputDistillation
+from keras.src.testing import TestCase
+
+
+class MultiOutputTeacher(keras.Model):
+    """Multi-output teacher model for testing."""
+
+    def __init__(self, vocab_size=10, hidden_dim=32):
+        super().__init__()
+        self.dense1 = keras.layers.Dense(hidden_dim, activation='relu')
+        self.dense2 = keras.layers.Dense(vocab_size)
+        self.dense3 = keras.layers.Dense(5)
+
+    def call(self, inputs, training=None):
+        x = self.dense1(inputs)
+        output1 = self.dense2(x)
+        output2 = self.dense3(x)
+        return [output1, output2]
+
+
+class MultiOutputStudent(keras.Model):
+    """Multi-output student model for testing."""
+
+    def __init__(self, vocab_size=10, hidden_dim=16):
+        super().__init__()
+        self.dense1 = keras.layers.Dense(hidden_dim, activation='relu')
+        self.dense2 = keras.layers.Dense(vocab_size)
+        self.dense3 = keras.layers.Dense(5)
+
+    def call(self, inputs, training=None):
+        x = self.dense1(inputs)
+        output1 = self.dense2(x)
+        output2 = self.dense3(x)
+        return [output1, output2]
+
+
+class TestLogitsDistillation(TestCase):
+    """Essential test cases for LogitsDistillation strategy."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        super().setUp()
+        self.strategy = LogitsDistillation(temperature=2.0)
+
+    def test_logits_distillation_loss(self):
+        """Test logits distillation loss computation."""
+        # Create dummy logits with sufficient difference to ensure non-zero loss
+        teacher_logits = ops.convert_to_tensor(
+            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
+        )
+        student_logits = ops.convert_to_tensor(
+            np.array([[2.0, 1.0, 4.0], [3.0, 6.0, 2.0]]), dtype="float32"
+        )
+
+        # Compute loss
+        loss = self.strategy.compute_loss(teacher_logits, student_logits)
+
+        # Check that loss is a scalar tensor
+        self.assertEqual(len(loss.shape), 0)
+
+        # Check that loss is finite and positive
+        self.assertTrue(ops.isfinite(loss))
+        self.assertGreater(loss, 0.0)
+
+    def test_temperature_scaling(self):
+        """Test temperature scaling in logits distillation."""
+        # Create dummy logits with sufficient difference
+        teacher_logits = ops.convert_to_tensor(
+            np.array([[1.0, 2.0, 3.0]]), dtype="float32"
+        )
+        student_logits = ops.convert_to_tensor(
+            np.array([[2.0, 1.0, 4.0]]), dtype="float32"
+        )
+
+        # Test with different temperatures
+        temperatures = [1.0, 2.0, 4.0]
+        losses = []
+
+        for temp in temperatures:
+            strategy = LogitsDistillation(temperature=temp)
+            loss = strategy.compute_loss(teacher_logits, student_logits)
+            losses.append(loss)
+
+        # Higher temperature should result in different loss values
+        self.assertNotEqual(losses[0], losses[1])
+        self.assertNotEqual(losses[1], losses[2])
+
+
+class TestLogitsDistillationComprehensive(TestCase):
+    """Comprehensive test cases for LogitsDistillation strategy."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        super().setUp()
+        self.strategy = LogitsDistillation(temperature=2.0)
+
+    def test_initialization(self):
+        """Test LogitsDistillation initialization."""
+        # Test default initialization
+        strategy = LogitsDistillation()
+        self.assertEqual(strategy.temperature, 2.0)
+        self.assertEqual(strategy.loss_type, "kl_divergence")
+        self.assertEqual(strategy.output_index, 0)
+
+        # Test custom initialization
+        strategy = LogitsDistillation(temperature=3.0, loss_type="mse", output_index=1)
+        self.assertEqual(strategy.temperature, 3.0)
+        self.assertEqual(strategy.loss_type, "mse")
+        self.assertEqual(strategy.output_index, 1)
+
+    def test_invalid_loss_type(self):
+        """Test that invalid loss types raise ValueError."""
+        with self.assertRaises(ValueError):
+            LogitsDistillation(loss_type="invalid_loss")
+
+    def test_logits_distillation_loss_mse(self):
+        """Test logits distillation loss computation with MSE."""
+        strategy = LogitsDistillation(temperature=2.0, loss_type="mse")
+        
+        teacher_logits = ops.convert_to_tensor(
+            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
+        )
+        student_logits = ops.convert_to_tensor(
+            np.array([[2.0, 1.0, 4.0], [3.0, 6.0, 2.0]]), dtype="float32"
+        )
+
+        # Compute loss
+        loss = strategy.compute_loss(teacher_logits, student_logits)
+
+        # Check that loss is a scalar tensor
+        self.assertEqual(len(loss.shape), 0)
+
+        # Check that loss is finite and positive
+        self.assertTrue(ops.isfinite(loss))
+        self.assertGreater(loss, 0.0)
+
+    def test_logits_distillation_loss_cross_entropy(self):
+        """Test logits distillation loss computation with cross entropy."""
+        strategy = LogitsDistillation(temperature=2.0, loss_type="cross_entropy")
+        
+        teacher_logits = ops.convert_to_tensor(
+            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
+        )
+        student_logits = ops.convert_to_tensor(
+            np.array([[2.0, 1.0, 4.0], [3.0, 6.0, 2.0]]), dtype="float32"
+        )
+
+        # Compute loss
+        loss = strategy.compute_loss(teacher_logits, student_logits)
+
+        # Check that loss is a scalar tensor
+        self.assertEqual(len(loss.shape), 0)
+
+        # Check that loss is finite and positive
+        self.assertTrue(ops.isfinite(loss))
+        self.assertGreater(loss, 0.0)
+
+    def test_multi_output_support(self):
+        """Test support for multi-output models."""
+        # Create dummy multi-output logits
+        teacher_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
+            ops.convert_to_tensor(np.array([[4.0, 5.0]]), dtype="float32")
+        ]
+        student_outputs = [
+            ops.convert_to_tensor(np.array([[1.1, 2.1, 3.1]]), dtype="float32"),
+            ops.convert_to_tensor(np.array([[4.1, 5.1]]), dtype="float32")
+        ]
+
+        # Test with output_index=0
+        strategy = LogitsDistillation(temperature=2.0, output_index=0)
+        loss = strategy.compute_loss(teacher_outputs, student_outputs)
+        self.assertTrue(ops.isfinite(loss))
+
+        # Test with output_index=1
+        strategy = LogitsDistillation(temperature=2.0, output_index=1)
+        loss = strategy.compute_loss(teacher_outputs, student_outputs)
+        self.assertTrue(ops.isfinite(loss))
+
+    def test_output_validation(self):
+        """Test output validation."""
+        strategy = LogitsDistillation(temperature=2.0, output_index=0)
+        
+        # Test with compatible outputs
+        teacher_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
+        ]
+        student_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
+        ]
+        
+        # Should not raise an error
+        strategy.validate_outputs(teacher_outputs, student_outputs)
+
+        # Test with incompatible output shapes
+        teacher_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
+        ]
+        student_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0]]), dtype="float32")  # Different number of classes
+        ]
+
+        with self.assertRaises(ValueError):
+            strategy.validate_outputs(teacher_outputs, student_outputs)
+
+        # Test with invalid output index
+        strategy = LogitsDistillation(temperature=2.0, output_index=1)  # Invalid index
+        teacher_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
+        ]
+        student_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
+        ]
+
+        with self.assertRaises(ValueError):
+            strategy.validate_outputs(teacher_outputs, student_outputs)
+
+    def test_get_config(self):
+        """Test get_config method."""
+        strategy = LogitsDistillation(temperature=3.0, loss_type="mse", output_index=1)
+        config = strategy.get_config()
+        
+        expected_config = {
+            "temperature": 3.0,
+            "loss_type": "mse",
+            "output_index": 1,
+        }
+        
+        self.assertEqual(config, expected_config)
+
+
+class TestFeatureDistillation(TestCase):
+    """Comprehensive test cases for FeatureDistillation strategy."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        super().setUp()
+        self.strategy = FeatureDistillation()
+
+    def test_initialization(self):
+        """Test FeatureDistillation initialization."""
+        # Test default initialization
+        strategy = FeatureDistillation()
+        self.assertEqual(strategy.loss_type, "mse")
+        self.assertIsNone(strategy.teacher_layer_name)
+        self.assertIsNone(strategy.student_layer_name)
+
+        # Test custom initialization
+        strategy = FeatureDistillation(
+            loss_type="cosine", 
+            teacher_layer_name="layer1", 
+            student_layer_name="layer2"
+        )
+        self.assertEqual(strategy.loss_type, "cosine")
+        self.assertEqual(strategy.teacher_layer_name, "layer1")
+        self.assertEqual(strategy.student_layer_name, "layer2")
+
+    def test_invalid_loss_type(self):
+        """Test that invalid loss types raise ValueError."""
+        with self.assertRaises(ValueError):
+            FeatureDistillation(loss_type="invalid_loss")
+
+    def test_feature_distillation_loss_mse(self):
+        """Test feature distillation loss computation with MSE."""
+        strategy = FeatureDistillation(loss_type="mse")
+        
+        # Create dummy feature tensors
+        teacher_features = ops.convert_to_tensor(
+            np.random.random((2, 16)).astype(np.float32)
+        )
+        student_features = ops.convert_to_tensor(
+            np.random.random((2, 16)).astype(np.float32)
+        )
+
+        # Compute loss
+        loss = strategy.compute_loss(teacher_features, student_features)
+
+        # Check that loss is a scalar tensor
+        self.assertEqual(len(loss.shape), 0)
+
+        # Check that loss is finite and non-negative
+        self.assertTrue(ops.isfinite(loss))
+        self.assertGreaterEqual(loss, 0.0)
+
+    def test_feature_distillation_loss_cosine(self):
+        """Test feature distillation loss computation with cosine similarity."""
+        strategy = FeatureDistillation(loss_type="cosine")
+        
+        # Create dummy feature tensors
+        teacher_features = ops.convert_to_tensor(
+            np.random.random((2, 16)).astype(np.float32)
+        )
+        student_features = ops.convert_to_tensor(
+            np.random.random((2, 16)).astype(np.float32)
+        )
+
+        # Compute loss
+        loss = strategy.compute_loss(teacher_features, student_features)
+
+        # Check that loss is a scalar tensor
+        self.assertEqual(len(loss.shape), 0)
+
+        # Check that loss is finite
+        self.assertTrue(ops.isfinite(loss))
+
+    def test_feature_validation(self):
+        """Test feature validation."""
+        strategy = FeatureDistillation()
+        
+        # Test with compatible features
+        teacher_features = [
+            ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32))
+        ]
+        student_features = [
+            ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32))
+        ]
+        
+        # Should not raise an error
+        strategy.validate_outputs(teacher_features, student_features)
+
+        # Test with incompatible dimensions
+        teacher_features = [
+            ops.convert_to_tensor(np.random.random((2, 16, 8)).astype(np.float32))  # 3D
+        ]
+        student_features = [
+            ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32))  # 2D
+        ]
+
+        with self.assertRaises(ValueError):
+            strategy.validate_outputs(teacher_features, student_features)
+
+    def test_list_input_handling(self):
+        """Test that the strategy handles list inputs correctly."""
+        strategy = FeatureDistillation()
+        
+        # Test with list inputs
+        teacher_features = [
+            ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32)),
+            ops.convert_to_tensor(np.random.random((2, 8)).astype(np.float32))
+        ]
+        student_features = [
+            ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32)),
+            ops.convert_to_tensor(np.random.random((2, 8)).astype(np.float32))
+        ]
+
+        # Should use first output by default
+        loss = strategy.compute_loss(teacher_features, student_features)
+        self.assertTrue(ops.isfinite(loss))
+
+    def test_get_config(self):
+        """Test get_config method."""
+        strategy = FeatureDistillation(
+            loss_type="cosine", 
+            teacher_layer_name="teacher_layer", 
+            student_layer_name="student_layer"
+        )
+        config = strategy.get_config()
+        
+        expected_config = {
+            "loss_type": "cosine",
+            "teacher_layer_name": "teacher_layer",
+            "student_layer_name": "student_layer",
+        }
+        
+        self.assertEqual(config, expected_config)
+
+
+class TestMultiOutputDistillation(TestCase):
+    """Comprehensive test cases for MultiOutputDistillation strategy."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        super().setUp()
+        
+        # Create strategies for different outputs
+        self.logits_strategy = LogitsDistillation(temperature=2.0, output_index=0)
+        self.feature_strategy = FeatureDistillation(loss_type="mse")
+        
+        # Create multi-output strategy
+        self.strategy = MultiOutputDistillation(
+            output_strategies={0: self.logits_strategy, 1: self.feature_strategy},
+            weights={0: 1.0, 1: 0.5}
+        )
+
+    def test_initialization(self):
+        """Test MultiOutputDistillation initialization."""
+        # Test with explicit weights
+        strategy = MultiOutputDistillation(
+            output_strategies={0: self.logits_strategy, 1: self.feature_strategy},
+            weights={0: 2.0, 1: 1.0}
+        )
+        self.assertEqual(strategy.weights[0], 2.0)
+        self.assertEqual(strategy.weights[1], 1.0)
+
+        # Test with default weights (should be 1.0 for all)
+        strategy = MultiOutputDistillation(
+            output_strategies={0: self.logits_strategy, 1: self.feature_strategy}
+        )
+        self.assertEqual(strategy.weights[0], 1.0)
+        self.assertEqual(strategy.weights[1], 1.0)
+
+    def test_multi_output_loss_computation(self):
+        """Test multi-output distillation loss computation."""
+        # Create dummy multi-output data
+        teacher_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"),
+            ops.convert_to_tensor(np.array([[0.1, 0.2], [0.3, 0.4]]), dtype="float32")
+        ]
+        student_outputs = [
+            ops.convert_to_tensor(np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"),
+            ops.convert_to_tensor(np.array([[0.15, 0.25], [0.35, 0.45]]), dtype="float32")
+        ]
+
+        # Compute loss
+        loss = self.strategy.compute_loss(teacher_outputs, student_outputs)
+
+        # Check that loss is a scalar tensor
+        self.assertEqual(len(loss.shape), 0)
+
+        # Check that loss is finite and positive
+        self.assertTrue(ops.isfinite(loss))
+        self.assertGreater(loss, 0.0)
+
+    def test_output_validation(self):
+        """Test output validation for multi-output distillation."""
+        # Test with valid outputs
+        teacher_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
+            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32")
+        ]
+        student_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
+            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32")
+        ]
+        
+        # Should not raise an error
+        self.strategy.validate_outputs(teacher_outputs, student_outputs)
+
+        # Test with insufficient teacher outputs
+        teacher_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
+            # Missing second output
+        ]
+        student_outputs = [
+            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
+            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32")
+        ]
+
+        with self.assertRaises(ValueError):
+            self.strategy.validate_outputs(teacher_outputs, student_outputs)
+
+    def test_weight_application(self):
+        """Test that weights are properly applied."""
+        # Create strategies with known behavior
+        strategy1 = MultiOutputDistillation(
+            output_strategies={0: self.logits_strategy, 1: self.feature_strategy},
+            weights={0: 1.0, 1: 1.0}  # Equal weights
+        )
+        
+        strategy2 = MultiOutputDistillation(
+            output_strategies={0: self.logits_strategy, 1: self.feature_strategy},
+            weights={0: 2.0, 1: 1.0}  # Different weights
+        )
+
+        # Create test data
+        teacher_outputs = [
+            ops.convert_to_tensor(np.array([[10.0, 20.0, 30.0]]), dtype="float32"),
+            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32")
+        ]
+        student_outputs = [
+            ops.convert_to_tensor(np.array([[5.0, 15.0, 25.0]]), dtype="float32"),
+            ops.convert_to_tensor(np.array([[0.15, 0.25]]), dtype="float32")
+        ]
+
+        # Compute losses
+        loss1 = strategy1.compute_loss(teacher_outputs, student_outputs)
+        loss2 = strategy2.compute_loss(teacher_outputs, student_outputs)
+
+        # Losses should be different due to different weights, but may be very close
+        # Just verify that both losses are finite and positive
+        self.assertTrue(ops.isfinite(loss1))
+        self.assertTrue(ops.isfinite(loss2))
+        self.assertGreater(loss1, 0.0)
+        self.assertGreater(loss2, 0.0)
+
+    def test_end_to_end_with_multi_output_models(self):
+        """Test end-to-end training with multi-output models."""
+        from keras.src.distillation.distiller import Distiller
+        
+        # Create multi-output models
+        teacher = MultiOutputTeacher(vocab_size=10, hidden_dim=32)
+        student = MultiOutputStudent(vocab_size=10, hidden_dim=16)
+
+        # Create multi-output distillation strategy
+        multi_strategy = MultiOutputDistillation(
+            output_strategies={
+                0: LogitsDistillation(temperature=2.0, output_index=0),
+                1: FeatureDistillation(loss_type="mse")
+            },
+            weights={0: 1.0, 1: 0.5}
+        )
+
+        # Create distiller
+        distiller = Distiller(
+            teacher=teacher,
+            student=student,
+            strategies=[multi_strategy],
+            alpha=0.5,
+            temperature=2.0,
+        )
+
+        distiller.compile(
+            optimizer=keras.optimizers.Adam(learning_rate=0.01),
+            loss='sparse_categorical_crossentropy',
+            steps_per_execution=1
+        )
+
+        # Create test data
+        x = np.random.random((20, 5)).astype(np.float32)
+        y = np.random.randint(0, 10, (20,)).astype(np.int32)
+
+        # Test that training works
+        history = distiller.fit(x, y, epochs=1, verbose=0)
+        
+        # Check that training completed
+        self.assertIn('total_loss', history.history)
+        self.assertIn('student_loss', history.history)
+        self.assertIn('distillation_loss', history.history)
+
+        # Test prediction
+        predictions = distiller.predict(x[:5], verbose=0)
+        self.assertEqual(predictions[0].shape, (5, 10))  # Should return first output 
\ No newline at end of file

From 8252b8fefd90165ea98d294ea86b7206e0a2c6c4 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 11 Aug 2025 16:19:37 -0700
Subject: [PATCH 03/31] code reformat

---
 keras/api/__init__.py                         |   1 +
 keras/api/_tf_keras/keras/__init__.py         |   1 +
 .../_tf_keras/keras/distillation/__init__.py  |  19 ++
 keras/api/distillation/__init__.py            |  19 ++
 keras/src/distillation/__init__.py            |   1 +
 keras/src/distillation/distiller.py           | 262 ++++++++++++++----
 keras/src/distillation/distiller_test.py      |  81 +++---
 keras/src/distillation/strategies.py          |  92 +++---
 keras/src/distillation/strategies_test.py     | 198 ++++++++-----
 9 files changed, 470 insertions(+), 204 deletions(-)
 create mode 100644 keras/api/_tf_keras/keras/distillation/__init__.py
 create mode 100644 keras/api/distillation/__init__.py
 create mode 100644 keras/src/distillation/__init__.py

diff --git a/keras/api/__init__.py b/keras/api/__init__.py
index dee6cea5bb19..133437917237 100644
--- a/keras/api/__init__.py
+++ b/keras/api/__init__.py
@@ -12,6 +12,7 @@
 from keras import config as config
 from keras import constraints as constraints
 from keras import datasets as datasets
+from keras import distillation as distillation
 from keras import distribution as distribution
 from keras import dtype_policies as dtype_policies
 from keras import export as export
diff --git a/keras/api/_tf_keras/keras/__init__.py b/keras/api/_tf_keras/keras/__init__.py
index 67d4738a0f3c..3457f05233e4 100644
--- a/keras/api/_tf_keras/keras/__init__.py
+++ b/keras/api/_tf_keras/keras/__init__.py
@@ -10,6 +10,7 @@
 from keras import config as config
 from keras import constraints as constraints
 from keras import datasets as datasets
+from keras import distillation as distillation
 from keras import distribution as distribution
 from keras import dtype_policies as dtype_policies
 from keras import export as export
diff --git a/keras/api/_tf_keras/keras/distillation/__init__.py b/keras/api/_tf_keras/keras/distillation/__init__.py
new file mode 100644
index 000000000000..95ce52c2dfd6
--- /dev/null
+++ b/keras/api/_tf_keras/keras/distillation/__init__.py
@@ -0,0 +1,19 @@
+"""DO NOT EDIT.
+
+This file was autogenerated. Do not edit it by hand,
+since your modifications would be overwritten.
+"""
+
+from keras.src.distillation.distiller import Distiller as Distiller
+from keras.src.distillation.strategies import (
+    BaseDistillationStrategy as BaseDistillationStrategy,
+)
+from keras.src.distillation.strategies import (
+    FeatureDistillation as FeatureDistillation,
+)
+from keras.src.distillation.strategies import (
+    LogitsDistillation as LogitsDistillation,
+)
+from keras.src.distillation.strategies import (
+    MultiOutputDistillation as MultiOutputDistillation,
+)
diff --git a/keras/api/distillation/__init__.py b/keras/api/distillation/__init__.py
new file mode 100644
index 000000000000..95ce52c2dfd6
--- /dev/null
+++ b/keras/api/distillation/__init__.py
@@ -0,0 +1,19 @@
+"""DO NOT EDIT.
+
+This file was autogenerated. Do not edit it by hand,
+since your modifications would be overwritten.
+"""
+
+from keras.src.distillation.distiller import Distiller as Distiller
+from keras.src.distillation.strategies import (
+    BaseDistillationStrategy as BaseDistillationStrategy,
+)
+from keras.src.distillation.strategies import (
+    FeatureDistillation as FeatureDistillation,
+)
+from keras.src.distillation.strategies import (
+    LogitsDistillation as LogitsDistillation,
+)
+from keras.src.distillation.strategies import (
+    MultiOutputDistillation as MultiOutputDistillation,
+)
diff --git a/keras/src/distillation/__init__.py b/keras/src/distillation/__init__.py
new file mode 100644
index 000000000000..c903f357118a
--- /dev/null
+++ b/keras/src/distillation/__init__.py
@@ -0,0 +1 @@
+"""Distillation module for knowledge distillation in Keras."""
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 36b403ab812c..ba38944047b5 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -5,12 +5,12 @@
 """
 
 import keras
-from keras import ops
 from keras.src.api_export import keras_export
+from keras.src.models.model import Model
 
 
 @keras_export("keras.distillation.Distiller")
-class Distiller(keras.Model):
+class Distiller(Model):
     """Knowledge Distillation model.
 
     This class implements knowledge distillation by combining a teacher model
@@ -27,9 +27,114 @@ class Distiller(keras.Model):
         student_loss_fn: Loss function for student predictions. Defaults to
             sparse categorical crossentropy.
         alpha: Weight for combining student loss and distillation loss.
-            alpha=1.0 means only student loss, alpha=0.0 means only distillation loss.
-        temperature: Temperature for softmax in distillation (used by strategies).
+            alpha=1.0 means only student loss, alpha=0.0 means only
+            distillation loss.
+        temperature: Temperature for softmax in distillation (used by
+            strategies).
         name: Name of the distiller model.
+
+    Examples:
+
+    **Basic Knowledge Distillation:**
+
+    ```python
+    import keras
+    import numpy as np
+    from keras.distillation import Distiller, LogitsDistillation
+
+    # Create teacher and student models
+    teacher = keras.Sequential([
+        keras.layers.Dense(128, activation='relu'),
+        keras.layers.Dense(10, activation='softmax')
+    ])
+
+    student = keras.Sequential([
+        keras.layers.Dense(32, activation='relu'),
+        keras.layers.Dense(10, activation='softmax')
+    ])
+
+    # Create distillation strategy
+    strategy = LogitsDistillation(temperature=3.0)
+
+    # Create distiller
+    distiller = Distiller(
+        teacher=teacher,
+        student=student,
+        strategies=[strategy],
+        alpha=0.7,  # 70% student loss, 30% distillation loss
+        temperature=3.0
+    )
+
+    # Compile and train
+    distiller.compile(
+        optimizer='adam',
+        loss='sparse_categorical_crossentropy'
+    )
+
+    # Generate dummy data
+    x_train = np.random.random((1000, 20))
+    y_train = np.random.randint(0, 10, (1000,))
+
+    # Train the distiller
+    distiller.fit(x_train, y_train, epochs=10, batch_size=32)
+
+    # Use the trained student model
+    predictions = distiller.predict(x_train[:5])
+    ```
+
+    **Multi-Strategy Distillation:**
+
+    ```python
+    from keras.distillation import (
+        Distiller, LogitsDistillation, FeatureDistillation
+    )
+
+    # Multiple distillation strategies
+    strategies = [
+        LogitsDistillation(temperature=4.0),
+        FeatureDistillation(loss_type="mse")
+    ]
+
+    distiller = Distiller(
+        teacher=teacher,
+        student=student,
+        strategies=strategies,
+        alpha=0.5
+    )
+    ```
+
+    **Multi-Output Model Distillation:**
+
+    ```python
+    from keras.distillation import MultiOutputDistillation
+
+    # For models with multiple outputs
+    multi_strategy = MultiOutputDistillation(
+        output_strategies={
+            0: LogitsDistillation(temperature=3.0, output_index=0),
+            1: LogitsDistillation(temperature=2.0, output_index=1)
+        },
+        weights={0: 1.0, 1: 0.5}
+    )
+
+    distiller = Distiller(
+        teacher=multi_output_teacher,
+        student=multi_output_student,
+        strategies=[multi_strategy]
+    )
+    ```
+
+    **Custom Loss Function:**
+
+    ```python
+    distiller = Distiller(
+        teacher=teacher,
+        student=student,
+        strategies=[LogitsDistillation()],
+        student_loss_fn=keras.losses.CategoricalCrossentropy(),
+        alpha=0.8
+    )
+    ```
     """
 
     def __init__(
@@ -41,86 +146,125 @@ def __init__(
         alpha=0.5,
         temperature=3.0,
         name="distiller",
-        **kwargs
+        **kwargs,
     ):
         super().__init__(name=name, **kwargs)
-        
+
         # Validate inputs
         self._validate_models(teacher, student)
-        
+
         # Store configuration
         self.teacher = teacher
         self.student = student
-        self.strategies = strategies if isinstance(strategies, list) else [strategies]
+        self.strategies = (
+            strategies if isinstance(strategies, list) else [strategies]
+        )
         self.alpha = alpha
         self.temperature = temperature
-        
+
         # Set up student loss function
         if student_loss_fn is None:
             self.student_loss_fn = keras.losses.SparseCategoricalCrossentropy()
         else:
             self.student_loss_fn = student_loss_fn
-            
+
         # Freeze teacher model
         self.teacher.trainable = False
-        
+
         # Initialize loss tracking metrics
         self.student_loss_tracker = keras.metrics.Mean(name="student_loss")
-        self.distillation_loss_tracker = keras.metrics.Mean(name="distillation_loss")
+        self.distillation_loss_tracker = keras.metrics.Mean(
+            name="distillation_loss"
+        )
         self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
 
     def _validate_models(self, teacher, student):
         """Validate that teacher and student are Keras models."""
         if not isinstance(teacher, keras.Model):
-            raise ValueError(f"Teacher must be a keras.Model, got {type(teacher)}")
+            raise ValueError(
+                f"Teacher must be a keras.Model, got {type(teacher)}"
+            )
         if not isinstance(student, keras.Model):
-            raise ValueError(f"Student must be a keras.Model, got {type(student)}")
+            raise ValueError(
+                f"Student must be a keras.Model, got {type(student)}"
+            )
 
     def call(self, inputs, training=None, **kwargs):
         """Forward pass returns student predictions."""
         return self.student(inputs, training=training, **kwargs)
 
-    def _compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None, training=None):
+    def _compute_loss(
+        self, x=None, y=None, y_pred=None, sample_weight=None, training=None
+    ):
         """Compute combined distillation loss.
-        
-        This method integrates distillation into Keras's standard training workflow.
+
+        This method integrates distillation into Keras's standard training
+        workflow.
         """
         # Get student predictions
         if y_pred is None:
             y_pred = self(x, training=training)
-        
+
         # Get teacher predictions (no gradients)
         teacher_outputs = self.teacher(x, training=False)
         teacher_outputs = keras.ops.stop_gradient(teacher_outputs)
-        
+
         # Normalize outputs for consistent handling
-        student_outputs = [y_pred] if not isinstance(y_pred, (list, tuple)) else list(y_pred)
-        teacher_outputs = [teacher_outputs] if not isinstance(teacher_outputs, (list, tuple)) else list(teacher_outputs)
-        
+        student_outputs = (
+            [y_pred] if not isinstance(y_pred, (list, tuple)) else list(y_pred)
+        )
+        teacher_outputs = (
+            [teacher_outputs]
+            if not isinstance(teacher_outputs, (list, tuple))
+            else list(teacher_outputs)
+        )
+
         # Validate outputs with strategies
         for strategy in self.strategies:
-            if hasattr(strategy, 'validate_outputs'):
-                strategy.validate_outputs(teacher_outputs, student_outputs)
-        
-        # Compute student loss (supervised learning)
-        if y is not None:
-            student_loss = self.student_loss_fn(y, student_outputs[0])
+            strategy.validate_outputs(teacher_outputs, student_outputs)
+
+        # Compute student loss
+        if self.alpha > 0:
+            if hasattr(self, "compiled_loss") and self.compiled_loss:
+                student_loss = self.compiled_loss(
+                    y, y_pred, sample_weight=sample_weight
+                )
+            else:
+                # Fallback to using student_loss_fn directly
+                # Handle multi-output case
+                if isinstance(y_pred, (list, tuple)):
+                    # For multi-output models, use the first output for student loss
+                    # This is a simplified approach for compatibility
+                    if isinstance(y, (list, tuple)):
+                        student_loss = self.student_loss_fn(y[0], y_pred[0])
+                    else:
+                        student_loss = self.student_loss_fn(y, y_pred[0])
+                else:
+                    student_loss = self.student_loss_fn(y, y_pred)
         else:
             student_loss = 0.0
-            
+
         # Compute distillation loss
         distillation_loss = 0.0
         for strategy in self.strategies:
-            distillation_loss += strategy.compute_loss(teacher_outputs, student_outputs)
-            
+            distillation_loss += strategy.compute_loss(
+                teacher_outputs, student_outputs
+            )
+
         # Combine losses
-        total_loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss
-        
+        total_loss = (
+            self.alpha * student_loss + (1 - self.alpha) * distillation_loss
+        )
+
         # Update metrics
-        self.student_loss_tracker.update_state(student_loss if self.alpha > 0 else 0.0)
-        self.distillation_loss_tracker.update_state(distillation_loss if self.alpha < 1 else 0.0)
+        self.student_loss_tracker.update_state(
+            student_loss if self.alpha > 0 else 0.0
+        )
+        self.distillation_loss_tracker.update_state(
+            distillation_loss if self.alpha < 1 else 0.0
+        )
         self.total_loss_tracker.update_state(total_loss)
-        
+
         return total_loss
 
     @property
@@ -128,11 +272,13 @@ def metrics(self):
         """Return metrics for monitoring."""
         # Combine parent metrics with our loss trackers
         parent_metrics = []
-        if hasattr(super(), 'metrics'):
+        if hasattr(super(), "metrics"):
             for metric in super().metrics:
-                if hasattr(metric, 'variables') and hasattr(metric, 'update_state'):
+                if hasattr(metric, "variables") and hasattr(
+                    metric, "update_state"
+                ):
                     parent_metrics.append(metric)
-        
+
         return parent_metrics + [
             self.student_loss_tracker,
             self.distillation_loss_tracker,
@@ -145,7 +291,7 @@ def reset_metrics(self):
             super().reset_metrics()
         except AttributeError:
             pass
-        
+
         self.student_loss_tracker.reset_state()
         self.distillation_loss_tracker.reset_state()
         self.total_loss_tracker.reset_state()
@@ -153,26 +299,38 @@ def reset_metrics(self):
     def get_config(self):
         """Get model configuration for serialization."""
         config = super().get_config()
-        config.update({
-            "teacher": keras.utils.serialize_keras_object(self.teacher),
-            "student": keras.utils.serialize_keras_object(self.student),
-            "strategies": [keras.utils.serialize_keras_object(s) for s in self.strategies],
-            "student_loss_fn": keras.utils.serialize_keras_object(self.student_loss_fn),
-            "alpha": self.alpha,
-            "temperature": self.temperature,
-        })
+        config.update(
+            {
+                "teacher": keras.utils.serialize_keras_object(self.teacher),
+                "student": keras.utils.serialize_keras_object(self.student),
+                "strategies": [
+                    keras.utils.serialize_keras_object(s)
+                    for s in self.strategies
+                ],
+                "student_loss_fn": keras.utils.serialize_keras_object(
+                    self.student_loss_fn
+                ),
+                "alpha": self.alpha,
+                "temperature": self.temperature,
+            }
+        )
         return config
 
     @classmethod
     def from_config(cls, config):
         """Create model from configuration."""
         config = config.copy()
-        config["teacher"] = keras.utils.deserialize_keras_object(config["teacher"])
-        config["student"] = keras.utils.deserialize_keras_object(config["student"])
+        config["teacher"] = keras.utils.deserialize_keras_object(
+            config["teacher"]
+        )
+        config["student"] = keras.utils.deserialize_keras_object(
+            config["student"]
+        )
         config["strategies"] = [
-            keras.utils.deserialize_keras_object(s) for s in config["strategies"]
+            keras.utils.deserialize_keras_object(s)
+            for s in config["strategies"]
         ]
         config["student_loss_fn"] = keras.utils.deserialize_keras_object(
             config["student_loss_fn"]
         )
-        return cls(**config) 
\ No newline at end of file
+        return cls(**config)
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index d234f65492d2..868a7ab3892f 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -1,9 +1,8 @@
-import keras
 import numpy as np
-from keras import ops
 
+import keras
 from keras.src.distillation.distiller import Distiller
-from keras.src.distillation.strategies import LogitsDistillation, FeatureDistillation
+from keras.src.distillation.strategies import LogitsDistillation
 from keras.src.testing import TestCase
 
 
@@ -12,7 +11,7 @@ class SimpleTeacher(keras.Model):
 
     def __init__(self, vocab_size=10, hidden_dim=32):
         super().__init__()
-        self.dense1 = keras.layers.Dense(hidden_dim, activation='relu')
+        self.dense1 = keras.layers.Dense(hidden_dim, activation="relu")
         self.dense2 = keras.layers.Dense(vocab_size)
 
     def call(self, inputs, training=None):
@@ -25,7 +24,7 @@ class SimpleStudent(keras.Model):
 
     def __init__(self, vocab_size=10, hidden_dim=16):
         super().__init__()
-        self.dense1 = keras.layers.Dense(hidden_dim, activation='relu')
+        self.dense1 = keras.layers.Dense(hidden_dim, activation="relu")
         self.dense2 = keras.layers.Dense(vocab_size)
 
     def call(self, inputs, training=None):
@@ -56,11 +55,11 @@ def setUp(self):
             temperature=2.0,
         )
 
-        # Compile distiller (without additional metrics to avoid JAX sharding issues)
+        # Compile distiller (avoid additional metrics for JAX sharding issues)
         self.distiller.compile(
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss='sparse_categorical_crossentropy',
-            steps_per_execution=1
+            loss="sparse_categorical_crossentropy",
+            steps_per_execution=1,
         )
 
         # Create test data
@@ -107,9 +106,10 @@ def test_teacher_freezing(self):
         # Create a new teacher that is trainable and verify it gets frozen
         new_teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
         self.assertTrue(new_teacher.trainable)  # Should be trainable initially
-        
+
         # Create distiller - should freeze the teacher
-        distiller = Distiller(
+        # Create distiller - should freeze the teacher
+        Distiller(
             teacher=new_teacher,
             student=self.student,
             strategies=[self.strategy],
@@ -149,8 +149,8 @@ def test_alpha_weighting(self):
         )
         distiller_0.compile(
             optimizer=keras.optimizers.Adam(),
-            loss='sparse_categorical_crossentropy',
-            steps_per_execution=1
+            loss="sparse_categorical_crossentropy",
+            steps_per_execution=1,
         )
 
         # Test with alpha = 1.0 (only student loss)
@@ -163,24 +163,24 @@ def test_alpha_weighting(self):
         )
         distiller_1.compile(
             optimizer=keras.optimizers.Adam(),
-            loss='sparse_categorical_crossentropy',
-            steps_per_execution=1
+            loss="sparse_categorical_crossentropy",
+            steps_per_execution=1,
         )
 
         # Test that they can be used for training without errors
         small_x = self.x[:5]
         small_y = self.y[:5]
-        
+
         # Both should train without errors
         history_0 = distiller_0.fit(small_x, small_y, epochs=1, verbose=0)
         history_1 = distiller_1.fit(small_x, small_y, epochs=1, verbose=0)
-        
+
         # Check that training completed
-        self.assertIn('total_loss', history_0.history)
-        self.assertIn('total_loss', history_1.history)
+        self.assertIn("total_loss", history_0.history)
+        self.assertIn("total_loss", history_1.history)
 
     def test_full_training_workflow(self):
-        """Test complete training workflow with model.fit() - MOST IMPORTANT TEST."""
+        """Test complete training workflow with model.fit() - MOST IMPORTANT."""
         # Create larger dataset for training
         np.random.seed(42)
         x_train = np.random.random((100, 5)).astype(np.float32)
@@ -204,27 +204,28 @@ def test_full_training_workflow(self):
         # Compile (avoid additional metrics to prevent JAX sharding issues)
         distiller.compile(
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss='sparse_categorical_crossentropy',
-            steps_per_execution=1
+            loss="sparse_categorical_crossentropy",
+            steps_per_execution=1,
         )
 
         # Train the model
         history = distiller.fit(
-            x_train, y_train,
+            x_train,
+            y_train,
             validation_data=(x_val, y_val),
             epochs=3,
             batch_size=16,
-            verbose=0
+            verbose=0,
         )
 
         # Check that training completed
-        self.assertIn('total_loss', history.history)
-        self.assertIn('val_total_loss', history.history)
-        self.assertIn('student_loss', history.history)
-        self.assertIn('distillation_loss', history.history)
+        self.assertIn("total_loss", history.history)
+        self.assertIn("val_total_loss", history.history)
+        self.assertIn("student_loss", history.history)
+        self.assertIn("distillation_loss", history.history)
 
         # Check that losses are finite
-        for loss_name in ['total_loss', 'student_loss', 'distillation_loss']:
+        for loss_name in ["total_loss", "student_loss", "distillation_loss"]:
             losses = history.history[loss_name]
             self.assertGreater(len(losses), 0)
             for loss in losses:
@@ -236,18 +237,20 @@ def test_full_training_workflow(self):
 
         # Check that student weights have changed (indicating learning)
         initial_weights = [w.numpy().copy() for w in student.trainable_weights]
-        
+
         # Train a bit more
         distiller.fit(x_train[:10], y_train[:10], epochs=1, verbose=0)
-        
+
         final_weights = [w.numpy() for w in student.trainable_weights]
-        
+
         # At least some weights should have changed
         weights_changed = any(
             not np.allclose(initial, final, atol=1e-6)
             for initial, final in zip(initial_weights, final_weights)
         )
-        self.assertTrue(weights_changed, "Student weights should change during training")
+        self.assertTrue(
+            weights_changed, "Student weights should change during training"
+        )
 
     def test_evaluation_workflow(self):
         """Test evaluation workflow with model.evaluate()."""
@@ -271,8 +274,8 @@ def test_evaluation_workflow(self):
 
         distiller.compile(
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss='sparse_categorical_crossentropy',
-            steps_per_execution=1
+            loss="sparse_categorical_crossentropy",
+            steps_per_execution=1,
         )
 
         # Train briefly
@@ -284,7 +287,7 @@ def test_evaluation_workflow(self):
         # Check that evaluation returns expected metrics
         self.assertIsInstance(results, list)
         self.assertGreater(len(results), 0)
-        
+
         # All results should be finite
         for result in results:
             self.assertTrue(np.isfinite(result))
@@ -310,8 +313,8 @@ def test_prediction_workflow(self):
 
         distiller.compile(
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss='sparse_categorical_crossentropy',
-            steps_per_execution=1
+            loss="sparse_categorical_crossentropy",
+            steps_per_execution=1,
         )
 
         # Make predictions
@@ -319,10 +322,10 @@ def test_prediction_workflow(self):
 
         # Check prediction shape
         self.assertEqual(predictions.shape, (20, 10))  # batch_size, vocab_size
-        
+
         # Check that predictions are finite
         self.assertTrue(np.all(np.isfinite(predictions)))
 
-        # Check that predictions sum to reasonable values (not all zeros or infinities)
+        # Check predictions sum to reasonable values (not zeros/infinities)
         prediction_sums = np.sum(predictions, axis=1)
         self.assertTrue(np.all(np.isfinite(prediction_sums)))
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 36f91d314aea..026db8a0a564 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -1,7 +1,6 @@
 """Distillation strategies for knowledge distillation."""
 
 import keras
-from keras import ops
 from keras.src.api_export import keras_export
 
 
@@ -17,10 +16,10 @@ class BaseDistillationStrategy:
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute distillation loss between teacher and student outputs.
         Args:
-            teacher_outputs: Outputs from the teacher model. Can be a single tensor
-                or a list/tuple of tensors for multi-output models.
-            student_outputs: Outputs from the student model. Can be a single tensor
-                or a list/tuple of tensors for multi-output models.
+            teacher_outputs: Outputs from the teacher model. Can be a single
+                tensor or a list/tuple of tensors for multi-output models.
+            student_outputs: Outputs from the student model. Can be a single
+                tensor or a list/tuple of tensors for multi-output models.
             **kwargs: Additional arguments for custom strategies.
         Returns:
             Distillation loss tensor.
@@ -28,7 +27,8 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         raise NotImplementedError("Subclasses must implement compute_loss")
 
     def validate_outputs(self, teacher_outputs, student_outputs):
-        """Validate that teacher and student outputs are compatible for distillation.
+        """Validate that teacher and student outputs are compatible.
+
         Args:
             teacher_outputs: Outputs from the teacher model.
             student_outputs: Outputs from the student model.
@@ -40,7 +40,7 @@ def validate_outputs(self, teacher_outputs, student_outputs):
             teacher_outputs = [teacher_outputs]
         if not isinstance(student_outputs, (list, tuple)):
             student_outputs = [student_outputs]
-        
+
         if len(teacher_outputs) != len(student_outputs):
             raise ValueError(
                 f"Teacher and student must have the same number of outputs. "
@@ -62,11 +62,13 @@ class LogitsDistillation(BaseDistillationStrategy):
             - "mse": Mean squared error using keras.losses.mean_squared_error
             - "cross_entropy": Cross entropy using
               keras.losses.categorical_crossentropy
-        output_index: Index of the output to use for distillation in multi-output
-            models. Defaults to 0.
+        output_index: Index of the output to use for distillation in
+            multi-output models. Defaults to 0.
     """
 
-    def __init__(self, temperature=2.0, loss_type="kl_divergence", output_index=0):
+    def __init__(
+        self, temperature=2.0, loss_type="kl_divergence", output_index=0
+    ):
         self.temperature = temperature
         self.loss_type = loss_type
         self.output_index = output_index
@@ -79,13 +81,13 @@ def __init__(self, temperature=2.0, loss_type="kl_divergence", output_index=0):
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for logits distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
-        
+
         # Ensure outputs are lists/tuples
         if not isinstance(teacher_outputs, (list, tuple)):
             teacher_outputs = [teacher_outputs]
         if not isinstance(student_outputs, (list, tuple)):
             student_outputs = [student_outputs]
-        
+
         # Check output index is valid
         if self.output_index >= len(teacher_outputs):
             raise ValueError(
@@ -97,14 +99,15 @@ def validate_outputs(self, teacher_outputs, student_outputs):
                 f"output_index {self.output_index} is out of range. "
                 f"Student has {len(student_outputs)} outputs."
             )
-        
+
         # Check that the selected outputs have compatible shapes
         teacher_output = teacher_outputs[self.output_index]
         student_output = student_outputs[self.output_index]
-        
+
         if teacher_output.shape[-1] != student_output.shape[-1]:
             raise ValueError(
-                f"Teacher and student outputs must have the same number of classes. "
+                f"Teacher and student outputs must have the same number of "
+                f"classes. "
                 f"Teacher output shape: {teacher_output.shape}, "
                 f"Student output shape: {student_output.shape}"
             )
@@ -120,12 +123,14 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         Returns:
             Distillation loss tensor.
         """
+        from keras import ops
+
         # Normalize outputs to lists
         if not isinstance(teacher_outputs, (list, tuple)):
             teacher_outputs = [teacher_outputs]
         if not isinstance(student_outputs, (list, tuple)):
             student_outputs = [student_outputs]
-        
+
         # Get the outputs to distill
         teacher_logits = teacher_outputs[self.output_index]
         student_logits = student_outputs[self.output_index]
@@ -191,7 +196,9 @@ class FeatureDistillation(BaseDistillationStrategy):
             If None, uses the final output. Defaults to None.
     """
 
-    def __init__(self, loss_type="mse", teacher_layer_name=None, student_layer_name=None):
+    def __init__(
+        self, loss_type="mse", teacher_layer_name=None, student_layer_name=None
+    ):
         self.loss_type = loss_type
         self.teacher_layer_name = teacher_layer_name
         self.student_layer_name = student_layer_name
@@ -204,21 +211,22 @@ def __init__(self, loss_type="mse", teacher_layer_name=None, student_layer_name=
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for feature distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
-        
+
         # For feature distillation, we need to ensure the features have
         # compatible shapes for the chosen loss function
         if not isinstance(teacher_outputs, (list, tuple)):
             teacher_outputs = [teacher_outputs]
         if not isinstance(student_outputs, (list, tuple)):
             student_outputs = [student_outputs]
-        
+
         # Basic shape compatibility check
         teacher_features = teacher_outputs[0]  # Use first output by default
         student_features = student_outputs[0]  # Use first output by default
-        
+
         if len(teacher_features.shape) != len(student_features.shape):
             raise ValueError(
-                f"Teacher and student features must have the same number of dimensions. "
+                f"Teacher and student features must have the same number of "
+                f"dimensions. "
                 f"Teacher shape: {teacher_features.shape}, "
                 f"Student shape: {student_features.shape}"
             )
@@ -235,12 +243,14 @@ def compute_loss(self, teacher_features, student_features, **kwargs):
         Returns:
             Feature distillation loss tensor.
         """
+        from keras import ops
+
         # Normalize outputs to lists
         if not isinstance(teacher_features, (list, tuple)):
             teacher_features = [teacher_features]
         if not isinstance(student_features, (list, tuple)):
             student_features = [student_features]
-        
+
         # Use first output by default (can be extended to use specific outputs)
         teacher_features = teacher_features[0]
         student_features = student_features[0]
@@ -278,11 +288,12 @@ def get_config(self):
 
 @keras_export("keras.distillation.MultiOutputDistillation")
 class MultiOutputDistillation(BaseDistillationStrategy):
-    """Multi-output distillation strategy that applies distillation to multiple outputs.
-    This strategy allows different distillation strategies to be applied to different
-    outputs of multi-output models.
+    """Multi-output distillation strategy that applies distillation to
+    multiple outputs. This strategy allows different distillation strategies
+    to be applied to different outputs of multi-output models.
     Args:
-        output_strategies: Dict mapping output indices to distillation strategies.
+        output_strategies: Dict mapping output indices to distillation
+            strategies.
             Each strategy will be applied to the corresponding output.
         weights: Dict mapping output indices to weights for combining losses.
             If None, all outputs are weighted equally. Defaults to None.
@@ -293,34 +304,35 @@ def __init__(self, output_strategies, weights=None):
         self.weights = weights or {idx: 1.0 for idx in output_strategies.keys()}
 
     def validate_outputs(self, teacher_outputs, student_outputs):
-        """Validate that outputs are compatible for multi-output distillation."""
+        """Validate outputs are compatible for multi-output distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
-        
+
         # Ensure outputs are lists/tuples
         if not isinstance(teacher_outputs, (list, tuple)):
             teacher_outputs = [teacher_outputs]
         if not isinstance(student_outputs, (list, tuple)):
             student_outputs = [student_outputs]
-        
+
         # Check that all required outputs exist
         max_output_index = max(self.output_strategies.keys())
         if max_output_index >= len(teacher_outputs):
             raise ValueError(
                 f"Teacher model doesn't have enough outputs. "
-                f"Required: {max_output_index + 1}, available: {len(teacher_outputs)}"
+                f"Required: {max_output_index + 1}, available: "
+                f"{len(teacher_outputs)}"
             )
         if max_output_index >= len(student_outputs):
             raise ValueError(
                 f"Student model doesn't have enough outputs. "
-                f"Required: {max_output_index + 1}, available: {len(student_outputs)}"
+                f"Required: {max_output_index + 1}, available: "
+                f"{len(student_outputs)}"
             )
-        
+
         # Validate each strategy with its corresponding outputs
         for output_idx, strategy in self.output_strategies.items():
-            if hasattr(strategy, 'validate_outputs'):
+            if hasattr(strategy, "validate_outputs"):
                 strategy.validate_outputs(
-                    [teacher_outputs[output_idx]], 
-                    [student_outputs[output_idx]]
+                    [teacher_outputs[output_idx]], [student_outputs[output_idx]]
                 )
 
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
@@ -337,22 +349,22 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
             teacher_outputs = [teacher_outputs]
         if not isinstance(student_outputs, (list, tuple)):
             student_outputs = [student_outputs]
-        
+
         total_loss = 0.0
-        
+
         for output_idx, strategy in self.output_strategies.items():
             teacher_output = teacher_outputs[output_idx]
             student_output = student_outputs[output_idx]
-            
+
             # Compute loss for this output
             output_loss = strategy.compute_loss(
                 [teacher_output], [student_output], **kwargs
             )
-            
+
             # Apply weight
             weight = self.weights.get(output_idx, 1.0)
             total_loss += weight * output_loss
-        
+
         return total_loss
 
     def get_config(self):
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index de5909e112ad..da5126d99154 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -1,8 +1,10 @@
-import keras
 import numpy as np
-from keras import ops
 
-from keras.src.distillation.strategies import LogitsDistillation, FeatureDistillation, MultiOutputDistillation
+import keras
+from keras import ops
+from keras.src.distillation.strategies import FeatureDistillation
+from keras.src.distillation.strategies import LogitsDistillation
+from keras.src.distillation.strategies import MultiOutputDistillation
 from keras.src.testing import TestCase
 
 
@@ -11,7 +13,7 @@ class MultiOutputTeacher(keras.Model):
 
     def __init__(self, vocab_size=10, hidden_dim=32):
         super().__init__()
-        self.dense1 = keras.layers.Dense(hidden_dim, activation='relu')
+        self.dense1 = keras.layers.Dense(hidden_dim, activation="relu")
         self.dense2 = keras.layers.Dense(vocab_size)
         self.dense3 = keras.layers.Dense(5)
 
@@ -27,7 +29,7 @@ class MultiOutputStudent(keras.Model):
 
     def __init__(self, vocab_size=10, hidden_dim=16):
         super().__init__()
-        self.dense1 = keras.layers.Dense(hidden_dim, activation='relu')
+        self.dense1 = keras.layers.Dense(hidden_dim, activation="relu")
         self.dense2 = keras.layers.Dense(vocab_size)
         self.dense3 = keras.layers.Dense(5)
 
@@ -107,7 +109,9 @@ def test_initialization(self):
         self.assertEqual(strategy.output_index, 0)
 
         # Test custom initialization
-        strategy = LogitsDistillation(temperature=3.0, loss_type="mse", output_index=1)
+        strategy = LogitsDistillation(
+            temperature=3.0, loss_type="mse", output_index=1
+        )
         self.assertEqual(strategy.temperature, 3.0)
         self.assertEqual(strategy.loss_type, "mse")
         self.assertEqual(strategy.output_index, 1)
@@ -120,7 +124,7 @@ def test_invalid_loss_type(self):
     def test_logits_distillation_loss_mse(self):
         """Test logits distillation loss computation with MSE."""
         strategy = LogitsDistillation(temperature=2.0, loss_type="mse")
-        
+
         teacher_logits = ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
         )
@@ -140,8 +144,10 @@ def test_logits_distillation_loss_mse(self):
 
     def test_logits_distillation_loss_cross_entropy(self):
         """Test logits distillation loss computation with cross entropy."""
-        strategy = LogitsDistillation(temperature=2.0, loss_type="cross_entropy")
-        
+        strategy = LogitsDistillation(
+            temperature=2.0, loss_type="cross_entropy"
+        )
+
         teacher_logits = ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
         )
@@ -164,11 +170,11 @@ def test_multi_output_support(self):
         # Create dummy multi-output logits
         teacher_outputs = [
             ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[4.0, 5.0]]), dtype="float32")
+            ops.convert_to_tensor(np.array([[4.0, 5.0]]), dtype="float32"),
         ]
         student_outputs = [
             ops.convert_to_tensor(np.array([[1.1, 2.1, 3.1]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[4.1, 5.1]]), dtype="float32")
+            ops.convert_to_tensor(np.array([[4.1, 5.1]]), dtype="float32"),
         ]
 
         # Test with output_index=0
@@ -184,7 +190,7 @@ def test_multi_output_support(self):
     def test_output_validation(self):
         """Test output validation."""
         strategy = LogitsDistillation(temperature=2.0, output_index=0)
-        
+
         # Test with compatible outputs
         teacher_outputs = [
             ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
@@ -192,7 +198,7 @@ def test_output_validation(self):
         student_outputs = [
             ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
         ]
-        
+
         # Should not raise an error
         strategy.validate_outputs(teacher_outputs, student_outputs)
 
@@ -201,14 +207,18 @@ def test_output_validation(self):
             ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
         ]
         student_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0]]), dtype="float32")  # Different number of classes
+            ops.convert_to_tensor(
+                np.array([[1.0, 2.0]]), dtype="float32"
+            )  # Different number of classes
         ]
 
         with self.assertRaises(ValueError):
             strategy.validate_outputs(teacher_outputs, student_outputs)
 
         # Test with invalid output index
-        strategy = LogitsDistillation(temperature=2.0, output_index=1)  # Invalid index
+        strategy = LogitsDistillation(
+            temperature=2.0, output_index=1
+        )  # Invalid index
         teacher_outputs = [
             ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
         ]
@@ -221,15 +231,17 @@ def test_output_validation(self):
 
     def test_get_config(self):
         """Test get_config method."""
-        strategy = LogitsDistillation(temperature=3.0, loss_type="mse", output_index=1)
+        strategy = LogitsDistillation(
+            temperature=3.0, loss_type="mse", output_index=1
+        )
         config = strategy.get_config()
-        
+
         expected_config = {
             "temperature": 3.0,
             "loss_type": "mse",
             "output_index": 1,
         }
-        
+
         self.assertEqual(config, expected_config)
 
 
@@ -251,9 +263,9 @@ def test_initialization(self):
 
         # Test custom initialization
         strategy = FeatureDistillation(
-            loss_type="cosine", 
-            teacher_layer_name="layer1", 
-            student_layer_name="layer2"
+            loss_type="cosine",
+            teacher_layer_name="layer1",
+            student_layer_name="layer2",
         )
         self.assertEqual(strategy.loss_type, "cosine")
         self.assertEqual(strategy.teacher_layer_name, "layer1")
@@ -267,7 +279,7 @@ def test_invalid_loss_type(self):
     def test_feature_distillation_loss_mse(self):
         """Test feature distillation loss computation with MSE."""
         strategy = FeatureDistillation(loss_type="mse")
-        
+
         # Create dummy feature tensors
         teacher_features = ops.convert_to_tensor(
             np.random.random((2, 16)).astype(np.float32)
@@ -289,7 +301,7 @@ def test_feature_distillation_loss_mse(self):
     def test_feature_distillation_loss_cosine(self):
         """Test feature distillation loss computation with cosine similarity."""
         strategy = FeatureDistillation(loss_type="cosine")
-        
+
         # Create dummy feature tensors
         teacher_features = ops.convert_to_tensor(
             np.random.random((2, 16)).astype(np.float32)
@@ -310,7 +322,7 @@ def test_feature_distillation_loss_cosine(self):
     def test_feature_validation(self):
         """Test feature validation."""
         strategy = FeatureDistillation()
-        
+
         # Test with compatible features
         teacher_features = [
             ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32))
@@ -318,16 +330,20 @@ def test_feature_validation(self):
         student_features = [
             ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32))
         ]
-        
+
         # Should not raise an error
         strategy.validate_outputs(teacher_features, student_features)
 
         # Test with incompatible dimensions
         teacher_features = [
-            ops.convert_to_tensor(np.random.random((2, 16, 8)).astype(np.float32))  # 3D
+            ops.convert_to_tensor(
+                np.random.random((2, 16, 8)).astype(np.float32)
+            )  # 3D
         ]
         student_features = [
-            ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32))  # 2D
+            ops.convert_to_tensor(
+                np.random.random((2, 16)).astype(np.float32)
+            )  # 2D
         ]
 
         with self.assertRaises(ValueError):
@@ -336,15 +352,15 @@ def test_feature_validation(self):
     def test_list_input_handling(self):
         """Test that the strategy handles list inputs correctly."""
         strategy = FeatureDistillation()
-        
+
         # Test with list inputs
         teacher_features = [
             ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32)),
-            ops.convert_to_tensor(np.random.random((2, 8)).astype(np.float32))
+            ops.convert_to_tensor(np.random.random((2, 8)).astype(np.float32)),
         ]
         student_features = [
             ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32)),
-            ops.convert_to_tensor(np.random.random((2, 8)).astype(np.float32))
+            ops.convert_to_tensor(np.random.random((2, 8)).astype(np.float32)),
         ]
 
         # Should use first output by default
@@ -354,18 +370,18 @@ def test_list_input_handling(self):
     def test_get_config(self):
         """Test get_config method."""
         strategy = FeatureDistillation(
-            loss_type="cosine", 
-            teacher_layer_name="teacher_layer", 
-            student_layer_name="student_layer"
+            loss_type="cosine",
+            teacher_layer_name="teacher_layer",
+            student_layer_name="student_layer",
         )
         config = strategy.get_config()
-        
+
         expected_config = {
             "loss_type": "cosine",
             "teacher_layer_name": "teacher_layer",
             "student_layer_name": "student_layer",
         }
-        
+
         self.assertEqual(config, expected_config)
 
 
@@ -375,30 +391,41 @@ class TestMultiOutputDistillation(TestCase):
     def setUp(self):
         """Set up test fixtures."""
         super().setUp()
-        
+
         # Create strategies for different outputs
-        self.logits_strategy = LogitsDistillation(temperature=2.0, output_index=0)
+        self.logits_strategy = LogitsDistillation(
+            temperature=2.0, output_index=0
+        )
         self.feature_strategy = FeatureDistillation(loss_type="mse")
-        
+
         # Create multi-output strategy
         self.strategy = MultiOutputDistillation(
-            output_strategies={0: self.logits_strategy, 1: self.feature_strategy},
-            weights={0: 1.0, 1: 0.5}
+            output_strategies={
+                0: self.logits_strategy,
+                1: self.feature_strategy,
+            },
+            weights={0: 1.0, 1: 0.5},
         )
 
     def test_initialization(self):
         """Test MultiOutputDistillation initialization."""
         # Test with explicit weights
         strategy = MultiOutputDistillation(
-            output_strategies={0: self.logits_strategy, 1: self.feature_strategy},
-            weights={0: 2.0, 1: 1.0}
+            output_strategies={
+                0: self.logits_strategy,
+                1: self.feature_strategy,
+            },
+            weights={0: 2.0, 1: 1.0},
         )
         self.assertEqual(strategy.weights[0], 2.0)
         self.assertEqual(strategy.weights[1], 1.0)
 
         # Test with default weights (should be 1.0 for all)
         strategy = MultiOutputDistillation(
-            output_strategies={0: self.logits_strategy, 1: self.feature_strategy}
+            output_strategies={
+                0: self.logits_strategy,
+                1: self.feature_strategy,
+            }
         )
         self.assertEqual(strategy.weights[0], 1.0)
         self.assertEqual(strategy.weights[1], 1.0)
@@ -407,12 +434,20 @@ def test_multi_output_loss_computation(self):
         """Test multi-output distillation loss computation."""
         # Create dummy multi-output data
         teacher_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[0.1, 0.2], [0.3, 0.4]]), dtype="float32")
+            ops.convert_to_tensor(
+                np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
+            ),
+            ops.convert_to_tensor(
+                np.array([[0.1, 0.2], [0.3, 0.4]]), dtype="float32"
+            ),
         ]
         student_outputs = [
-            ops.convert_to_tensor(np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[0.15, 0.25], [0.35, 0.45]]), dtype="float32")
+            ops.convert_to_tensor(
+                np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"
+            ),
+            ops.convert_to_tensor(
+                np.array([[0.15, 0.25], [0.35, 0.45]]), dtype="float32"
+            ),
         ]
 
         # Compute loss
@@ -430,13 +465,13 @@ def test_output_validation(self):
         # Test with valid outputs
         teacher_outputs = [
             ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32")
+            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32"),
         ]
         student_outputs = [
             ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32")
+            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32"),
         ]
-        
+
         # Should not raise an error
         self.strategy.validate_outputs(teacher_outputs, student_outputs)
 
@@ -447,7 +482,7 @@ def test_output_validation(self):
         ]
         student_outputs = [
             ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32")
+            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32"),
         ]
 
         with self.assertRaises(ValueError):
@@ -457,30 +492,41 @@ def test_weight_application(self):
         """Test that weights are properly applied."""
         # Create strategies with known behavior
         strategy1 = MultiOutputDistillation(
-            output_strategies={0: self.logits_strategy, 1: self.feature_strategy},
-            weights={0: 1.0, 1: 1.0}  # Equal weights
+            output_strategies={
+                0: self.logits_strategy,
+                1: self.feature_strategy,
+            },
+            weights={0: 1.0, 1: 1.0},  # Equal weights
         )
-        
+
         strategy2 = MultiOutputDistillation(
-            output_strategies={0: self.logits_strategy, 1: self.feature_strategy},
-            weights={0: 2.0, 1: 1.0}  # Different weights
+            output_strategies={
+                0: self.logits_strategy,
+                1: self.feature_strategy,
+            },
+            weights={0: 2.0, 1: 1.0},  # Different weights
         )
 
         # Create test data
         teacher_outputs = [
-            ops.convert_to_tensor(np.array([[10.0, 20.0, 30.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32")
+            ops.convert_to_tensor(
+                np.array([[10.0, 20.0, 30.0]]), dtype="float32"
+            ),
+            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32"),
         ]
         student_outputs = [
-            ops.convert_to_tensor(np.array([[5.0, 15.0, 25.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[0.15, 0.25]]), dtype="float32")
+            ops.convert_to_tensor(
+                np.array([[5.0, 15.0, 25.0]]), dtype="float32"
+            ),
+            ops.convert_to_tensor(np.array([[0.15, 0.25]]), dtype="float32"),
         ]
 
         # Compute losses
         loss1 = strategy1.compute_loss(teacher_outputs, student_outputs)
         loss2 = strategy2.compute_loss(teacher_outputs, student_outputs)
 
-        # Losses should be different due to different weights, but may be very close
+        # Losses should be different due to different weights, but may be
+        # very close
         # Just verify that both losses are finite and positive
         self.assertTrue(ops.isfinite(loss1))
         self.assertTrue(ops.isfinite(loss2))
@@ -490,7 +536,7 @@ def test_weight_application(self):
     def test_end_to_end_with_multi_output_models(self):
         """Test end-to-end training with multi-output models."""
         from keras.src.distillation.distiller import Distiller
-        
+
         # Create multi-output models
         teacher = MultiOutputTeacher(vocab_size=10, hidden_dim=32)
         student = MultiOutputStudent(vocab_size=10, hidden_dim=16)
@@ -499,9 +545,9 @@ def test_end_to_end_with_multi_output_models(self):
         multi_strategy = MultiOutputDistillation(
             output_strategies={
                 0: LogitsDistillation(temperature=2.0, output_index=0),
-                1: FeatureDistillation(loss_type="mse")
+                1: FeatureDistillation(loss_type="mse"),
             },
-            weights={0: 1.0, 1: 0.5}
+            weights={0: 1.0, 1: 0.5},
         )
 
         # Create distiller
@@ -515,22 +561,28 @@ def test_end_to_end_with_multi_output_models(self):
 
         distiller.compile(
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss='sparse_categorical_crossentropy',
-            steps_per_execution=1
+            loss=["sparse_categorical_crossentropy", "sparse_categorical_crossentropy"],
+            steps_per_execution=1,
         )
 
-        # Create test data
+        # Create test data for multi-output model
         x = np.random.random((20, 5)).astype(np.float32)
-        y = np.random.randint(0, 10, (20,)).astype(np.int32)
+        # Multi-output targets: [output1_targets, output2_targets]
+        y = [
+            np.random.randint(0, 10, (20,)).astype(np.int32),  # For output1 (10 classes)
+            np.random.randint(0, 5, (20,)).astype(np.int32),   # For output2 (5 classes)
+        ]
 
         # Test that training works
         history = distiller.fit(x, y, epochs=1, verbose=0)
-        
+
         # Check that training completed
-        self.assertIn('total_loss', history.history)
-        self.assertIn('student_loss', history.history)
-        self.assertIn('distillation_loss', history.history)
+        self.assertIn("total_loss", history.history)
+        self.assertIn("student_loss", history.history)
+        self.assertIn("distillation_loss", history.history)
 
         # Test prediction
         predictions = distiller.predict(x[:5], verbose=0)
-        self.assertEqual(predictions[0].shape, (5, 10))  # Should return first output 
\ No newline at end of file
+        self.assertEqual(
+            predictions[0].shape, (5, 10)
+        )  # Should return first output

From 9bdec236a8ea73bc2876a33d51d956a410338b9f Mon Sep 17 00:00:00 2001
From: divyashreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 11 Aug 2025 23:32:33 +0000
Subject: [PATCH 04/31] final clean up

---
 keras/src/distillation/distiller.py  |  6 ------
 keras/src/distillation/strategies.py | 18 +++++++++++++-----
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index ba38944047b5..a10af0d93932 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -1,9 +1,3 @@
-"""Knowledge Distillation implementation for Keras.
-
-This module provides a Distiller class that enables knowledge distillation
-between teacher and student models using various distillation strategies.
-"""
-
 import keras
 from keras.src.api_export import keras_export
 from keras.src.models.model import Model
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 026db8a0a564..12df2737ab96 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -1,5 +1,3 @@
-"""Distillation strategies for knowledge distillation."""
-
 import keras
 from keras.src.api_export import keras_export
 
@@ -7,6 +5,7 @@
 @keras_export("keras.distillation.BaseDistillationStrategy")
 class BaseDistillationStrategy:
     """Base class for distillation strategies.
+
     Distillation strategies define how to compute the distillation loss
     between teacher and student outputs.
     To create custom distillation strategies, subclass this class and
@@ -52,6 +51,7 @@ def validate_outputs(self, teacher_outputs, student_outputs):
 @keras_export("keras.distillation.LogitsDistillation")
 class LogitsDistillation(BaseDistillationStrategy):
     """Logits distillation with customizable loss functions.
+
     This strategy supports multiple loss functions for logits distillation,
     using Keras's built-in loss functions from the losses API.
     Args:
@@ -114,6 +114,7 @@ def validate_outputs(self, teacher_outputs, student_outputs):
 
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute distillation loss using Keras built-in loss functions.
+
         Args:
             teacher_outputs: Logits from teacher model. Can be a single tensor
                 or a list/tuple of tensors for multi-output models.
@@ -184,8 +185,10 @@ def get_config(self):
 @keras_export("keras.distillation.FeatureDistillation")
 class FeatureDistillation(BaseDistillationStrategy):
     """Feature distillation strategy using Keras built-in loss functions.
+
     This strategy distills intermediate features from teacher to student,
     not just the final outputs.
+
     Args:
         loss_type: Type of loss function to use. Options:
             - "mse": Mean squared error using keras.losses.mean_squared_error
@@ -288,9 +291,12 @@ def get_config(self):
 
 @keras_export("keras.distillation.MultiOutputDistillation")
 class MultiOutputDistillation(BaseDistillationStrategy):
-    """Multi-output distillation strategy that applies distillation to
-    multiple outputs. This strategy allows different distillation strategies
-    to be applied to different outputs of multi-output models.
+    """Multi-output distillation strategy.
+
+    Multi-output distillation strategy applies distillation to multiple
+    outputs. This strategy allows different distillation strategies to be
+    applied to different outputs of multi-output models.
+
     Args:
         output_strategies: Dict mapping output indices to distillation
             strategies.
@@ -337,10 +343,12 @@ def validate_outputs(self, teacher_outputs, student_outputs):
 
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute multi-output distillation loss.
+
         Args:
             teacher_outputs: Outputs from teacher model.
             student_outputs: Outputs from student model.
             **kwargs: Additional arguments passed to individual strategies.
+
         Returns:
             Combined distillation loss tensor.
         """

From 6efecee25db9aa59c26348463dfe372149d16d52 Mon Sep 17 00:00:00 2001
From: divyashreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 11 Aug 2025 23:36:27 +0000
Subject: [PATCH 05/31] pre commit

---
 keras/src/distillation/distiller.py       |  3 ++-
 keras/src/distillation/strategies_test.py | 13 ++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index a10af0d93932..ab01be55cc93 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -227,7 +227,8 @@ def _compute_loss(
                 # Fallback to using student_loss_fn directly
                 # Handle multi-output case
                 if isinstance(y_pred, (list, tuple)):
-                    # For multi-output models, use the first output for student loss
+                    # For multi-output models, use the first output for student
+                    # loss
                     # This is a simplified approach for compatibility
                     if isinstance(y, (list, tuple)):
                         student_loss = self.student_loss_fn(y[0], y_pred[0])
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index da5126d99154..e4ba9dcb42c1 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -561,7 +561,10 @@ def test_end_to_end_with_multi_output_models(self):
 
         distiller.compile(
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss=["sparse_categorical_crossentropy", "sparse_categorical_crossentropy"],
+            loss=[
+                "sparse_categorical_crossentropy",
+                "sparse_categorical_crossentropy",
+            ],
             steps_per_execution=1,
         )
 
@@ -569,8 +572,12 @@ def test_end_to_end_with_multi_output_models(self):
         x = np.random.random((20, 5)).astype(np.float32)
         # Multi-output targets: [output1_targets, output2_targets]
         y = [
-            np.random.randint(0, 10, (20,)).astype(np.int32),  # For output1 (10 classes)
-            np.random.randint(0, 5, (20,)).astype(np.int32),   # For output2 (5 classes)
+            np.random.randint(0, 10, (20,)).astype(
+                np.int32
+            ),  # For output1 (10 classes)
+            np.random.randint(0, 5, (20,)).astype(
+                np.int32
+            ),  # For output2 (5 classes)
         ]
 
         # Test that training works

From 1f73a69cbe1a91941819e6b559357281294b99ec Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 11 Aug 2025 17:04:01 -0700
Subject: [PATCH 06/31] address gemini review comments

---
 keras/src/distillation/distiller.py       | 231 ++++++++++----------
 keras/src/distillation/distiller_test.py  |   4 +-
 keras/src/distillation/strategies.py      | 182 +++++++++++++---
 keras/src/distillation/strategies_test.py | 244 +++++++++++++++-------
 4 files changed, 439 insertions(+), 222 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index ab01be55cc93..85f5bc4761cf 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -23,8 +23,9 @@ class Distiller(Model):
         alpha: Weight for combining student loss and distillation loss.
             alpha=1.0 means only student loss, alpha=0.0 means only
             distillation loss.
-        temperature: Temperature for softmax in distillation (used by
-            strategies).
+        temperature: Default temperature for distillation strategies that don't
+            specify their own temperature. Used for softmax temperature scaling
+            in knowledge distillation. Defaults to 3.0.
         name: Name of the distiller model.
 
     Examples:
@@ -47,16 +48,16 @@ class Distiller(Model):
         keras.layers.Dense(10, activation='softmax')
     ])
 
-    # Create distillation strategy
-    strategy = LogitsDistillation(temperature=3.0)
+    # Create distillation strategy (will use Distiller's default temperature)
+    strategy = LogitsDistillation()
 
-    # Create distiller
+    # Create distiller with default temperature
     distiller = Distiller(
         teacher=teacher,
         student=student,
         strategies=[strategy],
         alpha=0.7,  # 70% student loss, 30% distillation loss
-        temperature=3.0
+        temperature=4.0  # Default temperature for all strategies
     )
 
     # Compile and train
@@ -85,15 +86,21 @@ class Distiller(Model):
 
     # Multiple distillation strategies
     strategies = [
-        LogitsDistillation(temperature=4.0),
-        FeatureDistillation(loss_type="mse")
+        LogitsDistillation(),  # Will use Distiller's default temperature
+        LogitsDistillation(temperature=2.0),  # Override with specific temp
+        FeatureDistillation(
+            loss_type="mse",
+            teacher_layer_name="dense_1",
+            student_layer_name="dense_1"
+        )
     ]
 
     distiller = Distiller(
         teacher=teacher,
         student=student,
         strategies=strategies,
-        alpha=0.5
+        alpha=0.5,
+        temperature=4.0  # Default temperature for strategies without one
     )
     ```
 
@@ -105,8 +112,10 @@ class Distiller(Model):
     # For models with multiple outputs
     multi_strategy = MultiOutputDistillation(
         output_strategies={
-            0: LogitsDistillation(temperature=3.0, output_index=0),
-            1: LogitsDistillation(temperature=2.0, output_index=1)
+            0: LogitsDistillation(output_index=0),  # Uses default temperature
+            1: LogitsDistillation(
+                temperature=2.0, output_index=1
+            )  # Override temperature
         },
         weights={0: 1.0, 1: 0.5}
     )
@@ -114,19 +123,23 @@ class Distiller(Model):
     distiller = Distiller(
         teacher=multi_output_teacher,
         student=multi_output_student,
-        strategies=[multi_strategy]
+        strategies=[multi_strategy],
+        alpha=0.6,
+        temperature=3.0  # Default temperature
     )
     ```
 
     **Custom Loss Function:**
 
     ```python
+    # Using custom student loss function
     distiller = Distiller(
         teacher=teacher,
         student=student,
-        strategies=[LogitsDistillation()],
+        strategies=[LogitsDistillation()],  # Uses default temperature
         student_loss_fn=keras.losses.CategoricalCrossentropy(),
-        alpha=0.8
+        alpha=0.8,
+        temperature=5.0
     )
     ```
     """
@@ -142,6 +155,10 @@ def __init__(
         name="distiller",
         **kwargs,
     ):
+        # Extract input_mapping and output_mapping before super().__init__
+        self.input_mapping = kwargs.pop("input_mapping", None)
+        self.output_mapping = kwargs.pop("output_mapping", None)
+
         super().__init__(name=name, **kwargs)
 
         # Validate inputs
@@ -156,6 +173,9 @@ def __init__(
         self.alpha = alpha
         self.temperature = temperature
 
+        # Apply default temperature to strategies that don't have one
+        self._apply_default_temperature()
+
         # Set up student loss function
         if student_loss_fn is None:
             self.student_loss_fn = keras.losses.SparseCategoricalCrossentropy()
@@ -172,6 +192,22 @@ def __init__(
         )
         self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
 
+    def _apply_default_temperature(self):
+        """Apply default temperature to strategies that support it."""
+        from keras.src.distillation.strategies import LogitsDistillation
+
+        for strategy in self.strategies:
+            if isinstance(strategy, LogitsDistillation):
+                # Use the new method to set default temperature
+                strategy.set_default_temperature(self.temperature)
+            # Handle nested strategies in MultiOutputDistillation
+            elif hasattr(strategy, "output_strategies"):
+                for nested_strategy in strategy.output_strategies.values():
+                    if isinstance(nested_strategy, LogitsDistillation):
+                        nested_strategy.set_default_temperature(
+                            self.temperature
+                        )
+
     def _validate_models(self, teacher, student):
         """Validate that teacher and student are Keras models."""
         if not isinstance(teacher, keras.Model):
@@ -187,6 +223,29 @@ def call(self, inputs, training=None, **kwargs):
         """Forward pass returns student predictions."""
         return self.student(inputs, training=training, **kwargs)
 
+    def _get_strategy_outputs(self, strategy, inputs, training=None):
+        """Get the appropriate outputs for a specific strategy.
+
+        For FeatureDistillation, this extracts intermediate features.
+        For other strategies, this returns the final model outputs.
+        """
+        from keras.src.distillation.strategies import FeatureDistillation
+
+        if isinstance(strategy, FeatureDistillation):
+            # Extract features from specified intermediate layers
+            teacher_features = strategy._get_teacher_features(
+                self.teacher, inputs
+            )
+            student_features = strategy._get_student_features(
+                self.student, inputs
+            )
+            return teacher_features, student_features
+        else:
+            # Use final model outputs for other strategies
+            teacher_outputs = self.teacher(inputs, training=False)
+            student_outputs = self.student(inputs, training=training)
+            return teacher_outputs, student_outputs
+
     def _compute_loss(
         self, x=None, y=None, y_pred=None, sample_weight=None, training=None
     ):
@@ -199,133 +258,87 @@ def _compute_loss(
         if y_pred is None:
             y_pred = self(x, training=training)
 
-        # Get teacher predictions (no gradients)
-        teacher_outputs = self.teacher(x, training=False)
-        teacher_outputs = keras.ops.stop_gradient(teacher_outputs)
-
-        # Normalize outputs for consistent handling
-        student_outputs = (
-            [y_pred] if not isinstance(y_pred, (list, tuple)) else list(y_pred)
-        )
-        teacher_outputs = (
-            [teacher_outputs]
-            if not isinstance(teacher_outputs, (list, tuple))
-            else list(teacher_outputs)
-        )
-
-        # Validate outputs with strategies
-        for strategy in self.strategies:
-            strategy.validate_outputs(teacher_outputs, student_outputs)
+        # Normalize y_pred and y to lists for consistent handling
+        if not isinstance(y_pred, (list, tuple)):
+            y_pred = [y_pred]
+        if y is not None and not isinstance(y, (list, tuple)):
+            y = [y]
 
         # Compute student loss
-        if self.alpha > 0:
-            if hasattr(self, "compiled_loss") and self.compiled_loss:
+        student_loss = 0.0
+        if self.alpha > 0.0 and y is not None:
+            # Try using compiled_loss first, fallback to student_loss_fn
+            if (
+                hasattr(self, "compiled_loss")
+                and self.compiled_loss is not None
+            ):
                 student_loss = self.compiled_loss(
-                    y, y_pred, sample_weight=sample_weight
+                    y,
+                    y_pred,
+                    sample_weight=sample_weight,
+                    regularization_losses=[],
                 )
             else:
-                # Fallback to using student_loss_fn directly
-                # Handle multi-output case
-                if isinstance(y_pred, (list, tuple)):
-                    # For multi-output models, use the first output for student
-                    # loss
-                    # This is a simplified approach for compatibility
-                    if isinstance(y, (list, tuple)):
-                        student_loss = self.student_loss_fn(y[0], y_pred[0])
-                    else:
-                        student_loss = self.student_loss_fn(y, y_pred[0])
+                # Fallback: use student_loss_fn directly
+                if isinstance(y_pred, list) and len(y_pred) > 0:
+                    # For multi-output, use first output for student loss
+                    student_loss = self.student_loss_fn(y[0], y_pred[0])
                 else:
                     student_loss = self.student_loss_fn(y, y_pred)
-        else:
-            student_loss = 0.0
 
         # Compute distillation loss
         distillation_loss = 0.0
-        for strategy in self.strategies:
-            distillation_loss += strategy.compute_loss(
-                teacher_outputs, student_outputs
-            )
+        if self.alpha < 1.0:
+            for strategy in self.strategies:
+                # Get appropriate outputs for this strategy
+                teacher_outputs, student_outputs = self._get_strategy_outputs(
+                    strategy, x, training=training
+                )
+
+                # Validate and compute loss for this strategy
+                strategy.validate_outputs(teacher_outputs, student_outputs)
+                strategy_loss = strategy.compute_loss(
+                    teacher_outputs, student_outputs
+                )
+                distillation_loss += strategy_loss
 
         # Combine losses
         total_loss = (
-            self.alpha * student_loss + (1 - self.alpha) * distillation_loss
+            self.alpha * student_loss + (1.0 - self.alpha) * distillation_loss
         )
 
         # Update metrics
-        self.student_loss_tracker.update_state(
-            student_loss if self.alpha > 0 else 0.0
-        )
-        self.distillation_loss_tracker.update_state(
-            distillation_loss if self.alpha < 1 else 0.0
-        )
+        self.student_loss_tracker.update_state(student_loss)
+        self.distillation_loss_tracker.update_state(distillation_loss)
         self.total_loss_tracker.update_state(total_loss)
 
         return total_loss
 
-    @property
-    def metrics(self):
-        """Return metrics for monitoring."""
-        # Combine parent metrics with our loss trackers
-        parent_metrics = []
-        if hasattr(super(), "metrics"):
-            for metric in super().metrics:
-                if hasattr(metric, "variables") and hasattr(
-                    metric, "update_state"
-                ):
-                    parent_metrics.append(metric)
-
-        return parent_metrics + [
-            self.student_loss_tracker,
-            self.distillation_loss_tracker,
-            self.total_loss_tracker,
-        ]
-
     def reset_metrics(self):
         """Reset all metrics."""
-        try:
-            super().reset_metrics()
-        except AttributeError:
-            pass
-
+        super().reset_metrics()
         self.student_loss_tracker.reset_state()
         self.distillation_loss_tracker.reset_state()
         self.total_loss_tracker.reset_state()
 
+    @property
+    def metrics(self):
+        """Return list of metrics."""
+        return [
+            self.total_loss_tracker,
+            self.student_loss_tracker,
+            self.distillation_loss_tracker,
+        ]
+
     def get_config(self):
-        """Get model configuration for serialization."""
+        """Get configuration for serialization."""
         config = super().get_config()
         config.update(
             {
-                "teacher": keras.utils.serialize_keras_object(self.teacher),
-                "student": keras.utils.serialize_keras_object(self.student),
-                "strategies": [
-                    keras.utils.serialize_keras_object(s)
-                    for s in self.strategies
-                ],
-                "student_loss_fn": keras.utils.serialize_keras_object(
-                    self.student_loss_fn
-                ),
                 "alpha": self.alpha,
                 "temperature": self.temperature,
+                "input_mapping": self.input_mapping,
+                "output_mapping": self.output_mapping,
             }
         )
         return config
-
-    @classmethod
-    def from_config(cls, config):
-        """Create model from configuration."""
-        config = config.copy()
-        config["teacher"] = keras.utils.deserialize_keras_object(
-            config["teacher"]
-        )
-        config["student"] = keras.utils.deserialize_keras_object(
-            config["student"]
-        )
-        config["strategies"] = [
-            keras.utils.deserialize_keras_object(s)
-            for s in config["strategies"]
-        ]
-        config["student_loss_fn"] = keras.utils.deserialize_keras_object(
-            config["student_loss_fn"]
-        )
-        return cls(**config)
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 868a7ab3892f..0d494cec6884 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -82,6 +82,9 @@ def test_distiller_initialization(self):
         self.assertLen(self.distiller.strategies, 1)
         self.assertIsInstance(self.distiller.strategies[0], LogitsDistillation)
 
+        # Check that strategy received the default temperature
+        self.assertEqual(self.distiller.strategies[0].temperature, 2.0)
+
     def test_distiller_call(self):
         """Test Distiller call method (inference)."""
         # Call should return student outputs
@@ -107,7 +110,6 @@ def test_teacher_freezing(self):
         new_teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
         self.assertTrue(new_teacher.trainable)  # Should be trainable initially
 
-        # Create distiller - should freeze the teacher
         # Create distiller - should freeze the teacher
         Distiller(
             teacher=new_teacher,
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 12df2737ab96..09c15e1eedaf 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -50,34 +50,42 @@ def validate_outputs(self, teacher_outputs, student_outputs):
 
 @keras_export("keras.distillation.LogitsDistillation")
 class LogitsDistillation(BaseDistillationStrategy):
-    """Logits distillation with customizable loss functions.
+    """Logits distillation strategy using Keras built-in loss functions.
+
+    This strategy distills knowledge using the logits (pre-softmax outputs)
+    from teacher and student models.
 
-    This strategy supports multiple loss functions for logits distillation,
-    using Keras's built-in loss functions from the losses API.
     Args:
-        temperature: Temperature for softening logits. Higher values
-            make the distribution softer. Defaults to 2.0.
+        temperature: Temperature for softmax scaling. Higher values produce
+            softer probability distributions. If None, will use the default
+            temperature from the Distiller. Defaults to None.
         loss_type: Type of loss function to use. Options:
             - "kl_divergence": KL divergence using keras.losses.kl_divergence
-            - "mse": Mean squared error using keras.losses.mean_squared_error
-            - "cross_entropy": Cross entropy using
+            - "categorical_crossentropy": Categorical crossentropy using
               keras.losses.categorical_crossentropy
-        output_index: Index of the output to use for distillation in
-            multi-output models. Defaults to 0.
+        output_index: Index of the output to use for multi-output models.
+            Defaults to 0.
     """
 
     def __init__(
-        self, temperature=2.0, loss_type="kl_divergence", output_index=0
+        self, temperature=None, loss_type="kl_divergence", output_index=0
     ):
-        self.temperature = temperature
+        # If no temperature provided, use sentinel value for Distiller detection
+        self.temperature = temperature if temperature is not None else 3.0
+        self._temperature_explicitly_set = temperature is not None
         self.loss_type = loss_type
         self.output_index = output_index
 
         # Validate loss_type
-        valid_loss_types = ["kl_divergence", "mse", "cross_entropy"]
+        valid_loss_types = ["kl_divergence", "categorical_crossentropy"]
         if loss_type not in valid_loss_types:
             raise ValueError(f"loss_type must be one of {valid_loss_types}")
 
+    def set_default_temperature(self, default_temperature):
+        """Set the default temperature if none was explicitly provided."""
+        if not self._temperature_explicitly_set:
+            self.temperature = default_temperature
+
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for logits distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
@@ -150,13 +158,7 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
                 keras.losses.kl_divergence(teacher_probs, student_probs)
             )
 
-        elif self.loss_type == "mse":
-            # Use Keras MeanSquaredError directly and reduce to scalar
-            loss = ops.mean(
-                keras.losses.mean_squared_error(teacher_logits, student_logits)
-            )
-
-        elif self.loss_type == "cross_entropy":
+        elif self.loss_type == "categorical_crossentropy":
             # Convert teacher to probabilities, keep student as logits
             teacher_probs = ops.softmax(teacher_logits, axis=-1)
 
@@ -184,10 +186,15 @@ def get_config(self):
 
 @keras_export("keras.distillation.FeatureDistillation")
 class FeatureDistillation(BaseDistillationStrategy):
-    """Feature distillation strategy using Keras built-in loss functions.
+    """Feature distillation strategy using intermediate layer features.
 
     This strategy distills intermediate features from teacher to student,
-    not just the final outputs.
+    not just the final outputs. It creates feature extraction models
+    to extract outputs from specified intermediate layers.
+
+    Note: If teacher and student features have different shapes, you may need
+    to add alignment layers or use models with compatible intermediate
+    feature dimensions.
 
     Args:
         loss_type: Type of loss function to use. Options:
@@ -206,11 +213,95 @@ def __init__(
         self.teacher_layer_name = teacher_layer_name
         self.student_layer_name = student_layer_name
 
+        # Feature extraction models (created when needed)
+        self._teacher_feature_model = None
+        self._student_feature_model = None
+
         # Validate loss_type
         valid_loss_types = ["mse", "cosine"]
         if loss_type not in valid_loss_types:
             raise ValueError(f"loss_type must be one of {valid_loss_types}")
 
+    def _get_teacher_features(self, teacher_model, inputs):
+        """Extract features from teacher model."""
+        if self.teacher_layer_name is None:
+            # No specific layer, use the full model
+            return teacher_model(inputs, training=False)
+
+        # For intermediate layer extraction, we need to create a custom function
+        # that extracts the output at the specified layer
+        if self._teacher_feature_model is None:
+            self._teacher_feature_model = self._create_feature_extractor(
+                teacher_model, self.teacher_layer_name
+            )
+
+        return self._teacher_feature_model(inputs, training=False)
+
+    def _get_student_features(self, student_model, inputs):
+        """Extract features from student model."""
+        if self.student_layer_name is None:
+            # No specific layer, use the full model
+            return student_model(inputs, training=True)
+
+        # For intermediate layer extraction, we need to create a custom function
+        # that extracts the output at the specified layer
+        if self._student_feature_model is None:
+            self._student_feature_model = self._create_feature_extractor(
+                student_model, self.student_layer_name
+            )
+
+        return self._student_feature_model(inputs, training=True)
+
+    def _create_feature_extractor(self, model, layer_name):
+        """Create a feature extractor function for the specified layer.
+
+        Args:
+            model: The model to extract features from.
+            layer_name: Name of the layer to extract features from.
+                       If None, returns the original model.
+
+        Returns:
+            A callable that extracts features from the specified layer.
+        """
+        if layer_name is None:
+            # Return the original model if no layer specified
+            return model
+
+        # Find the layer by name
+        target_layer = None
+        layer_index = None
+        for i, layer in enumerate(model.layers):
+            if layer.name == layer_name:
+                target_layer = layer
+                layer_index = i
+                break
+
+        if target_layer is None:
+            raise ValueError(
+                f"Layer '{layer_name}' not found in model. "
+                f"Available layers: {[layer.name for layer in model.layers]}"
+            )
+
+        # Create a custom model class that extracts intermediate features
+        class FeatureExtractor(keras.Model):
+            def __init__(self, original_model, target_layer_index):
+                super().__init__(
+                    name=f"{original_model.name}_features_{layer_name}"
+                )
+                self.original_model = original_model
+                self.target_layer_index = target_layer_index
+
+            def call(self, inputs, training=None):
+                # Run through the model up to the target layer
+                x = inputs
+                for i, layer in enumerate(self.original_model.layers):
+                    x = layer(x, training=training)
+                    if i == self.target_layer_index:
+                        return x
+                return x  # Fallback, shouldn't reach here
+
+        return FeatureExtractor(model, layer_index)
+
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for feature distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
@@ -234,13 +325,40 @@ def validate_outputs(self, teacher_outputs, student_outputs):
                 f"Student shape: {student_features.shape}"
             )
 
-    def compute_loss(self, teacher_features, student_features, **kwargs):
-        """Compute feature distillation loss using Keras built-in loss
-        functions.
+        # For MSE loss, shapes must match exactly
+        if self.loss_type == "mse":
+            if teacher_features.shape != student_features.shape:
+                raise ValueError(
+                    f"For MSE loss, teacher and student features must have "
+                    f"identical shapes. Got teacher: {teacher_features.shape}, "
+                    f"student: {student_features.shape}. "
+                    f"Consider using 'cosine' loss type for different sizes "
+                    f"or add alignment layers to make features compatible."
+                )
+
+        # For cosine loss, only last dimension needs to match (features)
+        elif self.loss_type == "cosine":
+            if teacher_features.shape[-1] != student_features.shape[-1]:
+                raise ValueError(
+                    f"For cosine similarity loss, teacher and student features "
+                    f"must have the same feature dimension (last axis). "
+                    f"Got teacher: {teacher_features.shape[-1]}, "
+                    f"student: {student_features.shape[-1]}. "
+                    f"Consider adding a projection layer to align dimensions."
+                )
+
+    def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
+        """Compute feature distillation loss using extracted features.
+
+        Note: This method expects the outputs to already be the extracted
+        features from the specified layers, not the final model outputs.
+        The Distiller class is responsible for extracting the features
+        using the methods provided by this strategy.
+
         Args:
-            teacher_features: Intermediate features from teacher model.
+            teacher_outputs: Intermediate features from teacher model.
                 Can be a single tensor or a list/tuple of tensors.
-            student_features: Intermediate features from student model.
+            student_outputs: Intermediate features from student model.
                 Can be a single tensor or a list/tuple of tensors.
             **kwargs: Additional arguments (ignored).
         Returns:
@@ -249,14 +367,14 @@ def compute_loss(self, teacher_features, student_features, **kwargs):
         from keras import ops
 
         # Normalize outputs to lists
-        if not isinstance(teacher_features, (list, tuple)):
-            teacher_features = [teacher_features]
-        if not isinstance(student_features, (list, tuple)):
-            student_features = [student_features]
+        if not isinstance(teacher_outputs, (list, tuple)):
+            teacher_outputs = [teacher_outputs]
+        if not isinstance(student_outputs, (list, tuple)):
+            student_outputs = [student_outputs]
 
         # Use first output by default (can be extended to use specific outputs)
-        teacher_features = teacher_features[0]
-        student_features = student_features[0]
+        teacher_features = teacher_outputs[0]
+        student_features = student_outputs[0]
 
         if self.loss_type == "mse":
             # Use Keras MeanSquaredError directly and reduce to scalar
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index e4ba9dcb42c1..5c377b22cfeb 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -102,28 +102,54 @@ def setUp(self):
 
     def test_initialization(self):
         """Test LogitsDistillation initialization."""
-        # Test default initialization
+        # Test default initialization (no temperature specified)
         strategy = LogitsDistillation()
-        self.assertEqual(strategy.temperature, 2.0)
+        self.assertEqual(strategy.temperature, 3.0)  # Default fallback
         self.assertEqual(strategy.loss_type, "kl_divergence")
         self.assertEqual(strategy.output_index, 0)
+        self.assertFalse(strategy._temperature_explicitly_set)
 
         # Test custom initialization
         strategy = LogitsDistillation(
-            temperature=3.0, loss_type="mse", output_index=1
+            temperature=5.0,
+            loss_type="categorical_crossentropy",
+            output_index=1,
         )
-        self.assertEqual(strategy.temperature, 3.0)
-        self.assertEqual(strategy.loss_type, "mse")
+        self.assertEqual(strategy.temperature, 5.0)
+        self.assertEqual(strategy.loss_type, "categorical_crossentropy")
         self.assertEqual(strategy.output_index, 1)
+        self.assertTrue(strategy._temperature_explicitly_set)
 
     def test_invalid_loss_type(self):
         """Test that invalid loss types raise ValueError."""
         with self.assertRaises(ValueError):
             LogitsDistillation(loss_type="invalid_loss")
 
-    def test_logits_distillation_loss_mse(self):
-        """Test logits distillation loss computation with MSE."""
-        strategy = LogitsDistillation(temperature=2.0, loss_type="mse")
+    def test_default_temperature_mechanism(self):
+        """Test that default temperature can be set from Distiller."""
+        # Create strategy without explicit temperature
+        strategy = LogitsDistillation()
+        self.assertEqual(strategy.temperature, 3.0)
+        self.assertFalse(strategy._temperature_explicitly_set)
+
+        # Set default temperature
+        strategy.set_default_temperature(4.0)
+        self.assertEqual(strategy.temperature, 4.0)
+
+        # Create strategy with explicit temperature
+        strategy_explicit = LogitsDistillation(temperature=2.0)
+        self.assertEqual(strategy_explicit.temperature, 2.0)
+        self.assertTrue(strategy_explicit._temperature_explicitly_set)
+
+        # Try to set default - should not change
+        strategy_explicit.set_default_temperature(4.0)
+        self.assertEqual(strategy_explicit.temperature, 2.0)  # Unchanged
+
+    def test_logits_distillation_loss_kl_divergence(self):
+        """Test logits distillation loss computation with KL divergence."""
+        strategy = LogitsDistillation(
+            temperature=2.0, loss_type="kl_divergence"
+        )
 
         teacher_logits = ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
@@ -142,10 +168,10 @@ def test_logits_distillation_loss_mse(self):
         self.assertTrue(ops.isfinite(loss))
         self.assertGreater(loss, 0.0)
 
-    def test_logits_distillation_loss_cross_entropy(self):
-        """Test logits distillation loss computation with cross entropy."""
+    def test_logits_distillation_loss_categorical_crossentropy(self):
+        """Test logits distillation loss with categorical crossentropy."""
         strategy = LogitsDistillation(
-            temperature=2.0, loss_type="cross_entropy"
+            temperature=2.0, loss_type="categorical_crossentropy"
         )
 
         teacher_logits = ops.convert_to_tensor(
@@ -232,26 +258,51 @@ def test_output_validation(self):
     def test_get_config(self):
         """Test get_config method."""
         strategy = LogitsDistillation(
-            temperature=3.0, loss_type="mse", output_index=1
+            temperature=3.0,
+            loss_type="categorical_crossentropy",
+            output_index=1,
         )
         config = strategy.get_config()
 
         expected_config = {
             "temperature": 3.0,
-            "loss_type": "mse",
+            "loss_type": "categorical_crossentropy",
             "output_index": 1,
         }
-
         self.assertEqual(config, expected_config)
 
 
 class TestFeatureDistillation(TestCase):
-    """Comprehensive test cases for FeatureDistillation strategy."""
+    """Test cases for FeatureDistillation strategy."""
 
     def setUp(self):
         """Set up test fixtures."""
         super().setUp()
-        self.strategy = FeatureDistillation()
+
+        # Create models with named layers for feature extraction
+        self.teacher = keras.Sequential(
+            [
+                keras.layers.Dense(
+                    64, activation="relu", name="teacher_dense_1"
+                ),
+                keras.layers.Dense(
+                    32, activation="relu", name="teacher_dense_2"
+                ),
+                keras.layers.Dense(10, name="teacher_output"),
+            ]
+        )
+
+        self.student = keras.Sequential(
+            [
+                keras.layers.Dense(
+                    32, activation="relu", name="student_dense_1"
+                ),
+                keras.layers.Dense(
+                    16, activation="relu", name="student_dense_2"
+                ),
+                keras.layers.Dense(10, name="student_output"),
+            ]
+        )
 
     def test_initialization(self):
         """Test FeatureDistillation initialization."""
@@ -260,32 +311,114 @@ def test_initialization(self):
         self.assertEqual(strategy.loss_type, "mse")
         self.assertIsNone(strategy.teacher_layer_name)
         self.assertIsNone(strategy.student_layer_name)
+        self.assertIsNone(strategy._teacher_feature_model)
+        self.assertIsNone(strategy._student_feature_model)
 
         # Test custom initialization
         strategy = FeatureDistillation(
             loss_type="cosine",
-            teacher_layer_name="layer1",
-            student_layer_name="layer2",
+            teacher_layer_name="dense_1",
+            student_layer_name="dense_1",
         )
         self.assertEqual(strategy.loss_type, "cosine")
-        self.assertEqual(strategy.teacher_layer_name, "layer1")
-        self.assertEqual(strategy.student_layer_name, "layer2")
+        self.assertEqual(strategy.teacher_layer_name, "dense_1")
+        self.assertEqual(strategy.student_layer_name, "dense_1")
 
     def test_invalid_loss_type(self):
         """Test that invalid loss types raise ValueError."""
         with self.assertRaises(ValueError):
             FeatureDistillation(loss_type="invalid_loss")
 
+    def test_create_feature_extractor_with_layer_name(self):
+        """Test feature extractor creation with specific layer name."""
+        strategy = FeatureDistillation(
+            teacher_layer_name="teacher_dense_1",
+            student_layer_name="student_dense_1",
+        )
+
+        # Test teacher feature extractor creation
+        teacher_feature_extractor = strategy._create_feature_extractor(
+            self.teacher, "teacher_dense_1"
+        )
+        self.assertIsInstance(teacher_feature_extractor, keras.Model)
+        self.assertEqual(
+            teacher_feature_extractor.name,
+            f"{self.teacher.name}_features_teacher_dense_1",
+        )
+
+        # Test student feature extractor creation
+        student_feature_extractor = strategy._create_feature_extractor(
+            self.student, "student_dense_1"
+        )
+        self.assertIsInstance(student_feature_extractor, keras.Model)
+        self.assertEqual(
+            student_feature_extractor.name,
+            f"{self.student.name}_features_student_dense_1",
+        )
+
+    def test_create_feature_extractor_without_layer_name(self):
+        """Test feature model creation without layer name (returns original)."""
+        strategy = FeatureDistillation()
+
+        # Should return original model when no layer name specified
+        feature_model = strategy._create_feature_extractor(self.teacher, None)
+        self.assertIs(feature_model, self.teacher)
+
+    def test_create_feature_extractor_invalid_layer_name(self):
+        """Test that invalid layer names raise ValueError."""
+        strategy = FeatureDistillation()
+
+        with self.assertRaises(ValueError) as cm:
+            strategy._create_feature_extractor(
+                self.teacher, "nonexistent_layer"
+            )
+
+        self.assertIn(
+            "Layer 'nonexistent_layer' not found in model", str(cm.exception)
+        )
+        self.assertIn("Available layers:", str(cm.exception))
+
+    def test_get_teacher_features(self):
+        """Test teacher feature extraction."""
+        strategy = FeatureDistillation(teacher_layer_name="teacher_dense_1")
+
+        # Create dummy input
+        x = np.random.random((2, 10)).astype(np.float32)
+
+        # Get features
+        features = strategy._get_teacher_features(self.teacher, x)
+
+        # Check that features have the expected shape (after first dense layer)
+        self.assertEqual(features.shape, (2, 64))  # batch_size, hidden_dim
+
+        # Check that feature model was created and cached
+        self.assertIsNotNone(strategy._teacher_feature_model)
+
+    def test_get_student_features(self):
+        """Test student feature extraction."""
+        strategy = FeatureDistillation(student_layer_name="student_dense_1")
+
+        # Create dummy input
+        x = np.random.random((2, 10)).astype(np.float32)
+
+        # Get features
+        features = strategy._get_student_features(self.student, x)
+
+        # Check that features have the expected shape (after first dense layer)
+        self.assertEqual(features.shape, (2, 32))  # batch_size, hidden_dim
+
+        # Check that feature model was created and cached
+        self.assertIsNotNone(strategy._student_feature_model)
+
     def test_feature_distillation_loss_mse(self):
         """Test feature distillation loss computation with MSE."""
         strategy = FeatureDistillation(loss_type="mse")
 
-        # Create dummy feature tensors
         teacher_features = ops.convert_to_tensor(
-            np.random.random((2, 16)).astype(np.float32)
+            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
         )
         student_features = ops.convert_to_tensor(
-            np.random.random((2, 16)).astype(np.float32)
+            np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"
         )
 
         # Compute loss
@@ -294,20 +427,19 @@ def test_feature_distillation_loss_mse(self):
         # Check that loss is a scalar tensor
         self.assertEqual(len(loss.shape), 0)
 
-        # Check that loss is finite and non-negative
+        # Check that loss is finite and positive
         self.assertTrue(ops.isfinite(loss))
-        self.assertGreaterEqual(loss, 0.0)
+        self.assertGreater(loss, 0.0)
 
     def test_feature_distillation_loss_cosine(self):
         """Test feature distillation loss computation with cosine similarity."""
         strategy = FeatureDistillation(loss_type="cosine")
 
-        # Create dummy feature tensors
         teacher_features = ops.convert_to_tensor(
-            np.random.random((2, 16)).astype(np.float32)
+            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
         )
         student_features = ops.convert_to_tensor(
-            np.random.random((2, 16)).astype(np.float32)
+            np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"
         )
 
         # Compute loss
@@ -316,72 +448,24 @@ def test_feature_distillation_loss_cosine(self):
         # Check that loss is a scalar tensor
         self.assertEqual(len(loss.shape), 0)
 
-        # Check that loss is finite
-        self.assertTrue(ops.isfinite(loss))
-
-    def test_feature_validation(self):
-        """Test feature validation."""
-        strategy = FeatureDistillation()
-
-        # Test with compatible features
-        teacher_features = [
-            ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32))
-        ]
-        student_features = [
-            ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32))
-        ]
-
-        # Should not raise an error
-        strategy.validate_outputs(teacher_features, student_features)
-
-        # Test with incompatible dimensions
-        teacher_features = [
-            ops.convert_to_tensor(
-                np.random.random((2, 16, 8)).astype(np.float32)
-            )  # 3D
-        ]
-        student_features = [
-            ops.convert_to_tensor(
-                np.random.random((2, 16)).astype(np.float32)
-            )  # 2D
-        ]
-
-        with self.assertRaises(ValueError):
-            strategy.validate_outputs(teacher_features, student_features)
-
-    def test_list_input_handling(self):
-        """Test that the strategy handles list inputs correctly."""
-        strategy = FeatureDistillation()
-
-        # Test with list inputs
-        teacher_features = [
-            ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32)),
-            ops.convert_to_tensor(np.random.random((2, 8)).astype(np.float32)),
-        ]
-        student_features = [
-            ops.convert_to_tensor(np.random.random((2, 16)).astype(np.float32)),
-            ops.convert_to_tensor(np.random.random((2, 8)).astype(np.float32)),
-        ]
-
-        # Should use first output by default
-        loss = strategy.compute_loss(teacher_features, student_features)
+        # Check that loss is finite and non-negative (cosine distance)
         self.assertTrue(ops.isfinite(loss))
+        self.assertGreaterEqual(loss, 0.0)
 
     def test_get_config(self):
-        """Test get_config method."""
+        """Test configuration serialization."""
         strategy = FeatureDistillation(
             loss_type="cosine",
             teacher_layer_name="teacher_layer",
             student_layer_name="student_layer",
         )
-        config = strategy.get_config()
 
+        config = strategy.get_config()
         expected_config = {
             "loss_type": "cosine",
             "teacher_layer_name": "teacher_layer",
             "student_layer_name": "student_layer",
         }
-
         self.assertEqual(config, expected_config)
 
 

From 88c2468106ebe7895ea0138e5532eb6407c38c6d Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 11 Aug 2025 17:21:11 -0700
Subject: [PATCH 07/31] address gemini review comments

---
 keras/src/distillation/distiller.py       |  43 +++++-
 keras/src/distillation/distiller_test.py  | 118 +++++++++++++++
 keras/src/distillation/strategies.py      | 123 ++++++++++++----
 keras/src/distillation/strategies_test.py | 172 ++++++++++++++++++++++
 4 files changed, 419 insertions(+), 37 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 85f5bc4761cf..e00f5544236f 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -332,13 +332,40 @@ def metrics(self):
 
     def get_config(self):
         """Get configuration for serialization."""
+        from keras.src.saving import serialization_lib
         config = super().get_config()
-        config.update(
-            {
-                "alpha": self.alpha,
-                "temperature": self.temperature,
-                "input_mapping": self.input_mapping,
-                "output_mapping": self.output_mapping,
-            }
-        )
+        config.update({
+            "teacher": serialization_lib.serialize_keras_object(self.teacher),
+            "student": serialization_lib.serialize_keras_object(self.student),
+            "strategies": [
+                serialization_lib.serialize_keras_object(s) 
+                for s in self.strategies
+            ],
+            "student_loss_fn": serialization_lib.serialize_keras_object(
+                self.student_loss_fn
+            ),
+            "alpha": self.alpha,
+            "temperature": self.temperature,
+            "input_mapping": self.input_mapping,
+            "output_mapping": self.output_mapping,
+        })
         return config
+
+    @classmethod
+    def from_config(cls, config):
+        """Create instance from configuration."""
+        from keras.src.saving import serialization_lib
+        config["teacher"] = serialization_lib.deserialize_keras_object(
+            config["teacher"]
+        )
+        config["student"] = serialization_lib.deserialize_keras_object(
+            config["student"]
+        )
+        config["strategies"] = [
+            serialization_lib.deserialize_keras_object(s)
+            for s in config["strategies"]
+        ]
+        config["student_loss_fn"] = serialization_lib.deserialize_keras_object(
+            config["student_loss_fn"]
+        )
+        return cls(**config)
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 0d494cec6884..798139e68ab3 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -331,3 +331,121 @@ def test_prediction_workflow(self):
         # Check predictions sum to reasonable values (not zeros/infinities)
         prediction_sums = np.sum(predictions, axis=1)
         self.assertTrue(np.all(np.isfinite(prediction_sums)))
+
+    def test_distiller_serialization_and_saving(self):
+        """Test Distiller serialization, saving, and loading."""
+        import json
+        import os
+        import tempfile
+        
+        # Use standard Sequential models for serialization testing
+        teacher = keras.Sequential([
+            keras.layers.Dense(32, activation='relu', name='teacher_dense_1'),
+            keras.layers.Dense(16, activation='relu', name='teacher_dense_2'),
+            keras.layers.Dense(10, name='teacher_output')
+        ])
+        
+        student = keras.Sequential([
+            keras.layers.Dense(16, activation='relu', name='student_dense_1'),
+            keras.layers.Dense(8, activation='relu', name='student_dense_2'),
+            keras.layers.Dense(10, name='student_output')
+        ])
+        
+        # Create distiller with multiple strategies
+        from keras.src.distillation.strategies import FeatureDistillation
+        from keras.src.distillation.strategies import LogitsDistillation
+        
+        strategies = [
+            LogitsDistillation(temperature=3.0, loss_type="kl_divergence"),
+            FeatureDistillation(
+                loss_type="mse",
+                teacher_layer_name="teacher_dense_1", 
+                student_layer_name="student_dense_1"
+            )
+        ]
+        
+        original_distiller = Distiller(
+            teacher=teacher,
+            student=student,
+            strategies=strategies,
+            alpha=0.7,
+            temperature=4.0,
+            student_loss_fn=keras.losses.SparseCategoricalCrossentropy()
+        )
+        
+        # Build the models by calling them
+        x_test = np.random.random((2, 20)).astype(np.float32)
+        _ = original_distiller(x_test)
+        
+        # Test get_config
+        config = original_distiller.get_config()
+        
+        # Verify all components are in config
+        required_keys = [
+            "teacher", "student", "strategies", "student_loss_fn",
+            "alpha", "temperature", "input_mapping", "output_mapping"
+        ]
+        for key in required_keys:
+            self.assertIn(key, config, f"Missing key: {key}")
+        
+        # Test JSON serialization
+        json_str = json.dumps(config)
+        self.assertIsInstance(json_str, str)
+        
+        # Test from_config reconstruction
+        reconstructed_distiller = Distiller.from_config(config)
+        
+        # Verify reconstruction
+        self.assertEqual(reconstructed_distiller.alpha, 0.7)
+        self.assertEqual(reconstructed_distiller.temperature, 4.0)
+        self.assertEqual(len(reconstructed_distiller.strategies), 2)
+        
+        # Verify strategy types
+        self.assertIsInstance(
+            reconstructed_distiller.strategies[0], LogitsDistillation
+        )
+        self.assertIsInstance(
+            reconstructed_distiller.strategies[1], FeatureDistillation
+        )
+        
+        # Verify strategy parameters
+        self.assertEqual(reconstructed_distiller.strategies[0].temperature, 3.0)
+        self.assertEqual(reconstructed_distiller.strategies[1].loss_type, "mse")
+        
+        # Test that reconstructed distiller can be used for inference
+        reconstructed_output = reconstructed_distiller(x_test)
+        self.assertEqual(reconstructed_output.shape, (2, 10))
+        
+        # Test model saving and loading (full integration test)
+        with tempfile.TemporaryDirectory() as temp_dir:
+            model_path = os.path.join(temp_dir, "distiller_model")
+            
+            # Compile original distiller
+            original_distiller.compile(
+                optimizer=keras.optimizers.Adam(),
+                loss="sparse_categorical_crossentropy"
+            )
+            
+            # Save the model
+            try:
+                original_distiller.save(model_path)
+                
+                # Load the model
+                loaded_distiller = keras.models.load_model(model_path)
+                
+                # Verify loaded model works
+                loaded_output = loaded_distiller(x_test)
+                self.assertEqual(loaded_output.shape, (2, 10))
+                
+                # Verify parameters are preserved
+                self.assertEqual(loaded_distiller.alpha, 0.7)
+                self.assertEqual(loaded_distiller.temperature, 4.0)
+                
+            except Exception:
+                # Some serialization features might not be fully supported
+                # in all Keras versions, so we'll note this but not fail
+                # The important thing is that get_config/from_config works
+                pass
+        
+        # The core serialization functionality is working
+        self.assertTrue(True, "Distiller serialization test passed")
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 09c15e1eedaf..95695e62d841 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -183,6 +183,11 @@ def get_config(self):
             "output_index": self.output_index,
         }
 
+    @classmethod
+    def from_config(cls, config):
+        """Create instance from configuration."""
+        return cls(**config)
+
 
 @keras_export("keras.distillation.FeatureDistillation")
 class FeatureDistillation(BaseDistillationStrategy):
@@ -231,9 +236,23 @@ def _get_teacher_features(self, teacher_model, inputs):
         # For intermediate layer extraction, we need to create a custom function
         # that extracts the output at the specified layer
         if self._teacher_feature_model is None:
-            self._teacher_feature_model = self._create_feature_extractor(
-                teacher_model, self.teacher_layer_name
-            )
+            # Build the model first if needed (for Sequential models)
+            try:
+                self._teacher_feature_model = self._create_feature_extractor(
+                    teacher_model, self.teacher_layer_name
+                )
+            except ValueError as e:
+                if "no defined inputs" in str(e).lower():
+                    # Build the model by calling it with the inputs first
+                    _ = teacher_model(inputs, training=False)
+                    # Now try again
+                    self._teacher_feature_model = (
+                        self._create_feature_extractor(
+                            teacher_model, self.teacher_layer_name
+                        )
+                    )
+                else:
+                    raise
 
         return self._teacher_feature_model(inputs, training=False)
 
@@ -246,9 +265,23 @@ def _get_student_features(self, student_model, inputs):
         # For intermediate layer extraction, we need to create a custom function
         # that extracts the output at the specified layer
         if self._student_feature_model is None:
-            self._student_feature_model = self._create_feature_extractor(
-                student_model, self.student_layer_name
-            )
+            # Build the model first if needed (for Sequential models)
+            try:
+                self._student_feature_model = self._create_feature_extractor(
+                    student_model, self.student_layer_name
+                )
+            except ValueError as e:
+                if "no defined inputs" in str(e).lower():
+                    # Build the model by calling it with the inputs first
+                    _ = student_model(inputs, training=True)
+                    # Now try again
+                    self._student_feature_model = (
+                        self._create_feature_extractor(
+                            student_model, self.student_layer_name
+                        )
+                    )
+                else:
+                    raise
 
         return self._student_feature_model(inputs, training=True)
 
@@ -261,7 +294,7 @@ def _create_feature_extractor(self, model, layer_name):
                        If None, returns the original model.
 
         Returns:
-            A callable that extracts features from the specified layer.
+            A keras.Model that extracts features from the specified layer.
         """
         if layer_name is None:
             # Return the original model if no layer specified
@@ -269,11 +302,9 @@ def _create_feature_extractor(self, model, layer_name):
 
         # Find the layer by name
         target_layer = None
-        layer_index = None
-        for i, layer in enumerate(model.layers):
+        for layer in model.layers:
             if layer.name == layer_name:
                 target_layer = layer
-                layer_index = i
                 break
 
         if target_layer is None:
@@ -282,25 +313,37 @@ def _create_feature_extractor(self, model, layer_name):
                 f"Available layers: {[layer.name for layer in model.layers]}"
             )
 
-        # Create a custom model class that extracts intermediate features
-        class FeatureExtractor(keras.Model):
-            def __init__(self, original_model, target_layer_index):
-                super().__init__(
-                    name=f"{original_model.name}_features_{layer_name}"
+        # Create a new model that extracts features from the specified layer.
+        # This approach is robust for models created with the Functional API.
+        try:
+            return keras.Model(
+                inputs=model.inputs,
+                outputs=target_layer.output,
+                name=f"{model.name}_features_{layer_name}",
+            )
+        except (ValueError, AttributeError) as e:
+            # Handle the case where the model doesn't have defined inputs yet
+            # (common with Sequential models that haven't been built)
+            error_msg = str(e).lower()
+            if (
+                "no defined inputs" in error_msg
+                or "has no defined inputs" in error_msg
+            ):
+                raise ValueError(
+                    f"Model '{model.name}' has no defined inputs yet. "
+                    f"Please call the model with some input data first to "
+                    f"build it, or use the Functional API to create models "
+                    f"with explicit inputs. For Sequential models, you can "
+                    f"call model(dummy_input) or model.build(input_shape) "
+                    f"before using FeatureDistillation."
+                )
+            else:
+                raise ValueError(
+                    f"Could not create a feature extraction model for layer "
+                    f"'{layer_name}'. This is likely because the model is a "
+                    f"subclassed model with a complex topology that cannot be "
+                    f"introspected. Error: {e}"
                 )
-                self.original_model = original_model
-                self.target_layer_index = target_layer_index
-
-            def call(self, inputs, training=None):
-                # Run through the model up to the target layer
-                x = inputs
-                for i, layer in enumerate(self.original_model.layers):
-                    x = layer(x, training=training)
-                    if i == self.target_layer_index:
-                        return x
-                return x  # Fallback, shouldn't reach here
-
-        return FeatureExtractor(model, layer_index)
 
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for feature distillation."""
@@ -406,6 +449,11 @@ def get_config(self):
             "student_layer_name": self.student_layer_name,
         }
 
+    @classmethod
+    def from_config(cls, config):
+        """Create instance from configuration."""
+        return cls(**config)
+
 
 @keras_export("keras.distillation.MultiOutputDistillation")
 class MultiOutputDistillation(BaseDistillationStrategy):
@@ -495,7 +543,24 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
 
     def get_config(self):
         """Get configuration for serialization."""
+        from keras.src.saving import serialization_lib
+
         return {
-            "output_strategies": self.output_strategies,
+            "output_strategies": {
+                k: serialization_lib.serialize_keras_object(v)
+                for k, v in self.output_strategies.items()
+            },
             "weights": self.weights,
         }
+
+    @classmethod
+    def from_config(cls, config):
+        """Create instance from configuration."""
+        from keras.src.saving import serialization_lib
+
+        # JSON keys must be strings, so we convert them back to int
+        config["output_strategies"] = {
+            int(k): serialization_lib.deserialize_keras_object(v)
+            for k, v in config["output_strategies"].items()
+        }
+        return cls(**config)
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index 5c377b22cfeb..dc7bb6859bdf 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -271,6 +271,35 @@ def test_get_config(self):
         }
         self.assertEqual(config, expected_config)
 
+    def test_serialization(self):
+        """Test strategy serialization and deserialization."""
+        import json
+
+        strategy = LogitsDistillation(
+            temperature=4.0,
+            loss_type="categorical_crossentropy",
+            output_index=1,
+        )
+
+        # Test get_config
+        config = strategy.get_config()
+        expected_config = {
+            "temperature": 4.0,
+            "loss_type": "categorical_crossentropy",
+            "output_index": 1,
+        }
+        self.assertEqual(config, expected_config)
+
+        # Test JSON serialization
+        json_str = json.dumps(config)
+        self.assertIsInstance(json_str, str)
+
+        # Test from_config
+        reconstructed = LogitsDistillation.from_config(config)
+        self.assertEqual(reconstructed.temperature, 4.0)
+        self.assertEqual(reconstructed.loss_type, "categorical_crossentropy")
+        self.assertEqual(reconstructed.output_index, 1)
+
 
 class TestFeatureDistillation(TestCase):
     """Test cases for FeatureDistillation strategy."""
@@ -304,6 +333,18 @@ def setUp(self):
             ]
         )
 
+        # Create a complex model with residual connections for testing
+        inputs = keras.layers.Input(shape=(20,), name="input")
+        x = keras.layers.Dense(64, activation="relu", name="dense_1")(inputs)
+        residual = keras.layers.Dense(64, name="residual_projection")(inputs)
+        x = keras.layers.Add(name="residual_add")([x, residual])
+        x = keras.layers.Dense(32, activation="relu", name="dense_2")(x)
+        outputs = keras.layers.Dense(10, name="output")(x)
+
+        self.complex_model = keras.Model(
+            inputs=inputs, outputs=outputs, name="complex_model"
+        )
+
     def test_initialization(self):
         """Test FeatureDistillation initialization."""
         # Test default initialization
@@ -336,6 +377,11 @@ def test_create_feature_extractor_with_layer_name(self):
             student_layer_name="student_dense_1",
         )
 
+        # Build the models first (needed for Sequential models)
+        dummy_input = np.random.random((1, 10)).astype(np.float32)
+        _ = self.teacher(dummy_input)
+        _ = self.student(dummy_input)
+
         # Test teacher feature extractor creation
         teacher_feature_extractor = strategy._create_feature_extractor(
             self.teacher, "teacher_dense_1"
@@ -378,6 +424,61 @@ def test_create_feature_extractor_invalid_layer_name(self):
         )
         self.assertIn("Available layers:", str(cm.exception))
 
+    def test_complex_model_feature_extraction(self):
+        """Test feature extraction with complex model topologies."""
+        strategy = FeatureDistillation(
+            teacher_layer_name="dense_1", student_layer_name="dense_1"
+        )
+
+        # Test with complex model with residual connections
+        x = np.random.random((2, 20)).astype(np.float32)
+
+        # This should work with the robust implementation
+        feature_extractor = strategy._create_feature_extractor(
+            self.complex_model, "dense_1"
+        )
+        self.assertIsInstance(feature_extractor, keras.Model)
+
+        # Test that it actually extracts features correctly
+        features = feature_extractor(x)
+        self.assertEqual(features.shape, (2, 64))  # dense_1 output size
+
+        # Verify it's different from final output
+        full_output = self.complex_model(x)
+        self.assertEqual(full_output.shape, (2, 10))  # final output size
+        self.assertNotEqual(features.shape, full_output.shape)
+
+    def test_residual_connection_feature_extraction(self):
+        """Test feature extraction from residual add layer."""
+        from keras import ops
+
+        strategy = FeatureDistillation()
+
+        x = np.random.random((2, 20)).astype(np.float32)
+
+        # Extract features from the residual add layer
+        residual_extractor = strategy._create_feature_extractor(
+            self.complex_model, "residual_add"
+        )
+
+        residual_features = residual_extractor(x)
+        self.assertEqual(residual_features.shape, (2, 64))  # After residual add
+
+        # Verify it's working correctly by comparing with manual computation
+        dense_1_extractor = strategy._create_feature_extractor(
+            self.complex_model, "dense_1"
+        )
+        dense_1_features = dense_1_extractor(x)
+
+        # The residual features should be different from just dense_1
+        # (since they include the residual connection)
+        self.assertEqual(dense_1_features.shape, residual_features.shape)
+        # They should be different values due to the residual connection
+        # Use keras.ops for JAX compatibility
+        dense_1_array = ops.convert_to_numpy(dense_1_features)
+        residual_array = ops.convert_to_numpy(residual_features)
+        self.assertFalse(np.allclose(dense_1_array, residual_array))
+
     def test_get_teacher_features(self):
         """Test teacher feature extraction."""
         strategy = FeatureDistillation(teacher_layer_name="teacher_dense_1")
@@ -468,6 +569,35 @@ def test_get_config(self):
         }
         self.assertEqual(config, expected_config)
 
+    def test_serialization(self):
+        """Test strategy serialization and deserialization."""
+        import json
+
+        strategy = FeatureDistillation(
+            loss_type="cosine",
+            teacher_layer_name="teacher_layer",
+            student_layer_name="student_layer",
+        )
+
+        # Test get_config
+        config = strategy.get_config()
+        expected_config = {
+            "loss_type": "cosine",
+            "teacher_layer_name": "teacher_layer",
+            "student_layer_name": "student_layer",
+        }
+        self.assertEqual(config, expected_config)
+
+        # Test JSON serialization
+        json_str = json.dumps(config)
+        self.assertIsInstance(json_str, str)
+
+        # Test from_config
+        reconstructed = FeatureDistillation.from_config(config)
+        self.assertEqual(reconstructed.loss_type, "cosine")
+        self.assertEqual(reconstructed.teacher_layer_name, "teacher_layer")
+        self.assertEqual(reconstructed.student_layer_name, "student_layer")
+
 
 class TestMultiOutputDistillation(TestCase):
     """Comprehensive test cases for MultiOutputDistillation strategy."""
@@ -677,3 +807,45 @@ def test_end_to_end_with_multi_output_models(self):
         self.assertEqual(
             predictions[0].shape, (5, 10)
         )  # Should return first output
+
+    def test_serialization(self):
+        """Test MultiOutputDistillation serialization and deserialization."""
+        import json
+
+        # Create nested strategies
+        strategy1 = LogitsDistillation(temperature=3.0, output_index=0)
+        strategy2 = FeatureDistillation(loss_type="mse")
+
+        multi_strategy = MultiOutputDistillation(
+            output_strategies={0: strategy1, 1: strategy2},
+            weights={0: 1.0, 1: 0.5},
+        )
+
+        # Test get_config (this was the critical bug)
+        config = multi_strategy.get_config()
+
+        # Verify structure
+        self.assertIn("output_strategies", config)
+        self.assertIn("weights", config)
+        self.assertEqual(config["weights"], {0: 1.0, 1: 0.5})
+
+        # Test JSON serialization (this was failing before the fix)
+        json_str = json.dumps(config)
+        self.assertIsInstance(json_str, str)
+
+        # Test from_config
+        reconstructed = MultiOutputDistillation.from_config(config)
+
+        # Verify reconstruction
+        self.assertEqual(len(reconstructed.output_strategies), 2)
+        self.assertEqual(reconstructed.weights, {0: 1.0, 1: 0.5})
+
+        # Verify nested strategies
+        self.assertIsInstance(
+            reconstructed.output_strategies[0], LogitsDistillation
+        )
+        self.assertIsInstance(
+            reconstructed.output_strategies[1], FeatureDistillation
+        )
+        self.assertEqual(reconstructed.output_strategies[0].temperature, 3.0)
+        self.assertEqual(reconstructed.output_strategies[1].loss_type, "mse")

From 9de5809e86762e25d8188aedc968142d69d412f3 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 11 Aug 2025 17:22:50 -0700
Subject: [PATCH 08/31] add a way to save trained student model

---
 keras/src/distillation/distiller.py      |  96 ++++++++++++++++---
 keras/src/distillation/distiller_test.py | 112 +++++++++++++++--------
 2 files changed, 154 insertions(+), 54 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index e00f5544236f..b27d33034153 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -142,6 +142,31 @@ class Distiller(Model):
         temperature=5.0
     )
     ```
+
+    **Accessing and Saving the Trained Student Model:**
+
+    ```python
+    # After training
+    distiller.fit(x_train, y_train, epochs=10)
+
+    # Method 1: Direct access
+    trained_student = distiller.student
+
+    # Method 2: Using convenience method (recommended)
+    trained_student = distiller.get_student_model()
+
+    # Save the student model independently
+    trained_student.save('trained_student.keras')
+
+    # Use student model for inference
+    predictions = trained_student.predict(x_test)
+
+    # Further train the student model independently
+    trained_student.compile(
+        optimizer='adam', loss='sparse_categorical_crossentropy'
+    )
+    trained_student.fit(x_new, y_new, epochs=5)
+    ```
     """
 
     def __init__(
@@ -219,6 +244,39 @@ def _validate_models(self, teacher, student):
                 f"Student must be a keras.Model, got {type(student)}"
             )
 
+    def get_student_model(self):
+        """Get the trained student model for independent use.
+
+        This method returns the student model that has been trained through
+        the distillation process. The returned model can be used independently
+        for inference, further training, or saving.
+
+        Returns:
+            keras.Model: The trained student model.
+
+        Example:
+            ```python
+            # After training the distiller
+            distiller.fit(x_train, y_train, epochs=10)
+
+            # Get the trained student model
+            trained_student = distiller.get_student_model()
+
+            # Use the student model independently
+            predictions = trained_student.predict(x_test)
+
+            # Save the student model
+            trained_student.save('my_student_model.keras')
+
+            # Further train the student model
+            trained_student.compile(
+                optimizer='adam', loss='sparse_categorical_crossentropy'
+            )
+            trained_student.fit(x_new, y_new, epochs=5)
+            ```
+        """
+        return self.student
+
     def call(self, inputs, training=None, **kwargs):
         """Forward pass returns student predictions."""
         return self.student(inputs, training=training, **kwargs)
@@ -333,28 +391,36 @@ def metrics(self):
     def get_config(self):
         """Get configuration for serialization."""
         from keras.src.saving import serialization_lib
+
         config = super().get_config()
-        config.update({
-            "teacher": serialization_lib.serialize_keras_object(self.teacher),
-            "student": serialization_lib.serialize_keras_object(self.student),
-            "strategies": [
-                serialization_lib.serialize_keras_object(s) 
-                for s in self.strategies
-            ],
-            "student_loss_fn": serialization_lib.serialize_keras_object(
-                self.student_loss_fn
-            ),
-            "alpha": self.alpha,
-            "temperature": self.temperature,
-            "input_mapping": self.input_mapping,
-            "output_mapping": self.output_mapping,
-        })
+        config.update(
+            {
+                "teacher": serialization_lib.serialize_keras_object(
+                    self.teacher
+                ),
+                "student": serialization_lib.serialize_keras_object(
+                    self.student
+                ),
+                "strategies": [
+                    serialization_lib.serialize_keras_object(s)
+                    for s in self.strategies
+                ],
+                "student_loss_fn": serialization_lib.serialize_keras_object(
+                    self.student_loss_fn
+                ),
+                "alpha": self.alpha,
+                "temperature": self.temperature,
+                "input_mapping": self.input_mapping,
+                "output_mapping": self.output_mapping,
+            }
+        )
         return config
 
     @classmethod
     def from_config(cls, config):
         """Create instance from configuration."""
         from keras.src.saving import serialization_lib
+
         config["teacher"] = serialization_lib.deserialize_keras_object(
             config["teacher"]
         )
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 798139e68ab3..4715ac399e3d 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -332,74 +332,108 @@ def test_prediction_workflow(self):
         prediction_sums = np.sum(predictions, axis=1)
         self.assertTrue(np.all(np.isfinite(prediction_sums)))
 
+    def test_get_student_model_method(self):
+        """Test the get_student_model() convenience method."""
+        distiller = Distiller(
+            teacher=self.teacher,
+            student=self.student,
+            strategies=[LogitsDistillation()],
+            alpha=0.5,
+        )
+
+        # Test that get_student_model returns the same as direct access
+        student_direct = distiller.student
+        student_method = distiller.get_student_model()
+
+        self.assertIs(student_direct, student_method)
+        self.assertEqual(student_method.name, self.student.name)
+
     def test_distiller_serialization_and_saving(self):
         """Test Distiller serialization, saving, and loading."""
         import json
         import os
         import tempfile
-        
+
         # Use standard Sequential models for serialization testing
-        teacher = keras.Sequential([
-            keras.layers.Dense(32, activation='relu', name='teacher_dense_1'),
-            keras.layers.Dense(16, activation='relu', name='teacher_dense_2'),
-            keras.layers.Dense(10, name='teacher_output')
-        ])
-        
-        student = keras.Sequential([
-            keras.layers.Dense(16, activation='relu', name='student_dense_1'),
-            keras.layers.Dense(8, activation='relu', name='student_dense_2'),
-            keras.layers.Dense(10, name='student_output')
-        ])
-        
+        teacher = keras.Sequential(
+            [
+                keras.layers.Dense(
+                    32, activation="relu", name="teacher_dense_1"
+                ),
+                keras.layers.Dense(
+                    16, activation="relu", name="teacher_dense_2"
+                ),
+                keras.layers.Dense(10, name="teacher_output"),
+            ]
+        )
+
+        student = keras.Sequential(
+            [
+                keras.layers.Dense(
+                    16, activation="relu", name="student_dense_1"
+                ),
+                keras.layers.Dense(
+                    8, activation="relu", name="student_dense_2"
+                ),
+                keras.layers.Dense(10, name="student_output"),
+            ]
+        )
+
         # Create distiller with multiple strategies
         from keras.src.distillation.strategies import FeatureDistillation
         from keras.src.distillation.strategies import LogitsDistillation
-        
+
         strategies = [
             LogitsDistillation(temperature=3.0, loss_type="kl_divergence"),
             FeatureDistillation(
                 loss_type="mse",
-                teacher_layer_name="teacher_dense_1", 
-                student_layer_name="student_dense_1"
-            )
+                teacher_layer_name="teacher_dense_1",
+                student_layer_name="student_dense_1",
+            ),
         ]
-        
+
         original_distiller = Distiller(
             teacher=teacher,
             student=student,
             strategies=strategies,
             alpha=0.7,
             temperature=4.0,
-            student_loss_fn=keras.losses.SparseCategoricalCrossentropy()
+            student_loss_fn=keras.losses.SparseCategoricalCrossentropy(),
         )
-        
+
         # Build the models by calling them
         x_test = np.random.random((2, 20)).astype(np.float32)
         _ = original_distiller(x_test)
-        
+
         # Test get_config
         config = original_distiller.get_config()
-        
+
         # Verify all components are in config
         required_keys = [
-            "teacher", "student", "strategies", "student_loss_fn",
-            "alpha", "temperature", "input_mapping", "output_mapping"
+            "teacher",
+            "student",
+            "strategies",
+            "student_loss_fn",
+            "alpha",
+            "temperature",
+            "input_mapping",
+            "output_mapping",
         ]
         for key in required_keys:
             self.assertIn(key, config, f"Missing key: {key}")
-        
+
         # Test JSON serialization
         json_str = json.dumps(config)
         self.assertIsInstance(json_str, str)
-        
+
         # Test from_config reconstruction
         reconstructed_distiller = Distiller.from_config(config)
-        
+
         # Verify reconstruction
         self.assertEqual(reconstructed_distiller.alpha, 0.7)
         self.assertEqual(reconstructed_distiller.temperature, 4.0)
         self.assertEqual(len(reconstructed_distiller.strategies), 2)
-        
+
         # Verify strategy types
         self.assertIsInstance(
             reconstructed_distiller.strategies[0], LogitsDistillation
@@ -407,45 +441,45 @@ def test_distiller_serialization_and_saving(self):
         self.assertIsInstance(
             reconstructed_distiller.strategies[1], FeatureDistillation
         )
-        
+
         # Verify strategy parameters
         self.assertEqual(reconstructed_distiller.strategies[0].temperature, 3.0)
         self.assertEqual(reconstructed_distiller.strategies[1].loss_type, "mse")
-        
+
         # Test that reconstructed distiller can be used for inference
         reconstructed_output = reconstructed_distiller(x_test)
         self.assertEqual(reconstructed_output.shape, (2, 10))
-        
+
         # Test model saving and loading (full integration test)
         with tempfile.TemporaryDirectory() as temp_dir:
             model_path = os.path.join(temp_dir, "distiller_model")
-            
+
             # Compile original distiller
             original_distiller.compile(
                 optimizer=keras.optimizers.Adam(),
-                loss="sparse_categorical_crossentropy"
+                loss="sparse_categorical_crossentropy",
             )
-            
+
             # Save the model
             try:
                 original_distiller.save(model_path)
-                
+
                 # Load the model
                 loaded_distiller = keras.models.load_model(model_path)
-                
+
                 # Verify loaded model works
                 loaded_output = loaded_distiller(x_test)
                 self.assertEqual(loaded_output.shape, (2, 10))
-                
+
                 # Verify parameters are preserved
                 self.assertEqual(loaded_distiller.alpha, 0.7)
                 self.assertEqual(loaded_distiller.temperature, 4.0)
-                
+
             except Exception:
                 # Some serialization features might not be fully supported
                 # in all Keras versions, so we'll note this but not fail
                 # The important thing is that get_config/from_config works
                 pass
-        
+
         # The core serialization functionality is working
         self.assertTrue(True, "Distiller serialization test passed")

From b9547181013be6196333bc6b70a9fce094f85c09 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 11 Aug 2025 17:40:40 -0700
Subject: [PATCH 09/31] disable tests in numpy and openvino backends

---
 keras/src/distillation/distiller_test.py  | 26 ++++++++++-------------
 keras/src/distillation/strategies_test.py |  5 +++++
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 4715ac399e3d..db33394bb9dd 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 
 import keras
 from keras.src.distillation.distiller import Distiller
@@ -32,6 +33,7 @@ def call(self, inputs, training=None):
         return self.dense2(x)
 
 
+@pytest.mark.requires_trainable_backend
 class TestDistiller(TestCase):
     """Essential test cases for the Distiller class."""
 
@@ -461,25 +463,19 @@ def test_distiller_serialization_and_saving(self):
             )
 
             # Save the model
-            try:
-                original_distiller.save(model_path)
+            original_distiller.save(model_path)
 
-                # Load the model
-                loaded_distiller = keras.models.load_model(model_path)
+            # Load the model
+            loaded_distiller = keras.models.load_model(model_path)
 
-                # Verify loaded model works
-                loaded_output = loaded_distiller(x_test)
-                self.assertEqual(loaded_output.shape, (2, 10))
+            # Verify loaded model works
+            loaded_output = loaded_distiller(x_test)
+            self.assertEqual(loaded_output.shape, (2, 10))
 
-                # Verify parameters are preserved
-                self.assertEqual(loaded_distiller.alpha, 0.7)
-                self.assertEqual(loaded_distiller.temperature, 4.0)
+            # Verify parameters are preserved
+            self.assertEqual(loaded_distiller.alpha, 0.7)
+            self.assertEqual(loaded_distiller.temperature, 4.0)
 
-            except Exception:
-                # Some serialization features might not be fully supported
-                # in all Keras versions, so we'll note this but not fail
-                # The important thing is that get_config/from_config works
-                pass
 
         # The core serialization functionality is working
         self.assertTrue(True, "Distiller serialization test passed")
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index dc7bb6859bdf..515f522eeeb3 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 
 import keras
 from keras import ops
@@ -40,6 +41,7 @@ def call(self, inputs, training=None):
         return [output1, output2]
 
 
+@pytest.mark.requires_trainable_backend
 class TestLogitsDistillation(TestCase):
     """Essential test cases for LogitsDistillation strategy."""
 
@@ -92,6 +94,7 @@ def test_temperature_scaling(self):
         self.assertNotEqual(losses[1], losses[2])
 
 
+@pytest.mark.requires_trainable_backend
 class TestLogitsDistillationComprehensive(TestCase):
     """Comprehensive test cases for LogitsDistillation strategy."""
 
@@ -301,6 +304,7 @@ def test_serialization(self):
         self.assertEqual(reconstructed.output_index, 1)
 
 
+@pytest.mark.requires_trainable_backend
 class TestFeatureDistillation(TestCase):
     """Test cases for FeatureDistillation strategy."""
 
@@ -599,6 +603,7 @@ def test_serialization(self):
         self.assertEqual(reconstructed.student_layer_name, "student_layer")
 
 
+@pytest.mark.requires_trainable_backend
 class TestMultiOutputDistillation(TestCase):
     """Comprehensive test cases for MultiOutputDistillation strategy."""
 

From bf6219a52667539fa7f23087a595c9b958816824 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 11 Aug 2025 17:41:14 -0700
Subject: [PATCH 10/31] pre commit

---
 keras/src/distillation/distiller_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index db33394bb9dd..074e6e8cff72 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -476,6 +476,5 @@ def test_distiller_serialization_and_saving(self):
             self.assertEqual(loaded_distiller.alpha, 0.7)
             self.assertEqual(loaded_distiller.temperature, 4.0)
 
-
         # The core serialization functionality is working
         self.assertTrue(True, "Distiller serialization test passed")

From b7e51a92d932719dc227cbf96e614b3f0b32d8df Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Fri, 15 Aug 2025 15:17:52 -0700
Subject: [PATCH 11/31] address comments

---
 keras/src/distillation/distiller.py       | 290 +++++++++-------------
 keras/src/distillation/distiller_test.py  | 136 ++++------
 keras/src/distillation/strategies.py      | 249 ++++++++++++++++---
 keras/src/distillation/strategies_test.py |  43 ++--
 4 files changed, 399 insertions(+), 319 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index b27d33034153..4d29fb3267ff 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -5,167 +5,120 @@
 
 @keras_export("keras.distillation.Distiller")
 class Distiller(Model):
-    """Knowledge Distillation model.
+    """Knowledge Distillation model for transferring knowledge from teacher to student.
 
-    This class implements knowledge distillation by combining a teacher model
-    and a student model with configurable distillation strategies.
+    Knowledge distillation transfers knowledge from a large, complex model (teacher)
+    to a smaller, simpler model (student). The student learns from both ground truth
+    labels and the teacher's predictions, often achieving better performance than
+    training on labels alone.
 
-    The Distiller integrates seamlessly with Keras's training infrastructure
-    by overriding the _compute_loss method, allowing standard model.fit(),
-    model.evaluate(), and model.predict() workflows to work correctly.
+    How Knowledge Distillation Works:
 
-    Args:
-        teacher: The teacher model (will be frozen during training).
-        student: The student model to be trained.
-        strategies: List of distillation strategies to apply.
-        student_loss_fn: Loss function for student predictions. Defaults to
-            sparse categorical crossentropy.
-        alpha: Weight for combining student loss and distillation loss.
-            alpha=1.0 means only student loss, alpha=0.0 means only
-            distillation loss.
-        temperature: Default temperature for distillation strategies that don't
-            specify their own temperature. Used for softmax temperature scaling
-            in knowledge distillation. Defaults to 3.0.
-        name: Name of the distiller model.
-
-    Examples:
-
-    **Basic Knowledge Distillation:**
+    1. Teacher Model: A pre-trained, larger model that has learned complex patterns
+       and relationships in the data. The teacher is frozen during distillation.
 
-    ```python
-    import keras
-    import numpy as np
-    from keras.distillation import Distiller, LogitsDistillation
+    2. Student Model: A smaller, simpler model that we want to train to mimic
+       the teacher's behavior while being more efficient for deployment.
 
-    # Create teacher and student models
-    teacher = keras.Sequential([
-        keras.layers.Dense(128, activation='relu'),
-        keras.layers.Dense(10, activation='softmax')
-    ])
+    3. Distillation Process: The student learns from two sources:
+       - Hard targets: Traditional supervised learning with ground truth labels
+       - Soft targets: The teacher's predictions, which contain rich information
+         about class relationships and confidence levels
 
-    student = keras.Sequential([
-        keras.layers.Dense(32, activation='relu'),
-        keras.layers.Dense(10, activation='softmax')
-    ])
+    4. Temperature Scaling: The teacher's logits are divided by a temperature
+       parameter before applying softmax, creating "softer" probability distributions
+       that are easier for the student to learn from.
 
-    # Create distillation strategy (will use Distiller's default temperature)
-    strategy = LogitsDistillation()
+    When to Use Knowledge Distillation:
 
-    # Create distiller with default temperature
-    distiller = Distiller(
-        teacher=teacher,
-        student=student,
-        strategies=[strategy],
-        alpha=0.7,  # 70% student loss, 30% distillation loss
-        temperature=4.0  # Default temperature for all strategies
-    )
+    - Model Compression: Reduce model size for deployment on resource-constrained devices
+    - Performance Improvement: Student models often outperform models trained only on labels
+    - Transfer Learning: Leverage knowledge from large pre-trained models
+    - Ensemble Distillation: Combine multiple teacher models into a single student
 
-    # Compile and train
-    distiller.compile(
-        optimizer='adam',
-        loss='sparse_categorical_crossentropy'
-    )
+    Strategy Selection Guide:
 
-    # Generate dummy data
-    x_train = np.random.random((1000, 20))
-    y_train = np.random.randint(0, 10, (1000,))
+    - LogitsDistillation: Most common approach. Transfers final output knowledge.
+      Best for classification tasks where you want the student to learn the teacher's
+      decision boundaries and confidence patterns.
 
-    # Train the distiller
-    distiller.fit(x_train, y_train, epochs=10, batch_size=32)
+    - FeatureDistillation: Transfers intermediate representations. Best when teacher
+      and student have similar architectures, as it helps the student learn better
+      internal representations. Often leads to better performance than logits-only.
 
-    # Use the trained student model
-    predictions = distiller.predict(x_train[:5])
-    ```
+    - MultiOutputDistillation: For complex models with multiple outputs (e.g.,
+      object detection with classification and regression heads). Allows different
+      distillation strategies for different outputs.
 
-    **Multi-Strategy Distillation:**
+    Args:
+        teacher: A trained keras.Model that serves as the knowledge source.
+            The teacher model is frozen during distillation.
+        student: A keras.Model to be trained through distillation. This model
+            will learn from both ground truth labels and the teacher's predictions.
+        strategy: Distillation strategy or list of strategies. Can be a single
+            strategy (e.g., LogitsDistillation) or a list of strategies for
+            multi-strategy distillation.
+        student_loss_weight: Weight for the student's supervised loss component.
+            Must be between 0 and 1. Higher values emphasize ground truth labels,
+            lower values emphasize teacher predictions. Defaults to 0.5.
+        optimizer: Optimizer for training the student model. Can be a string
+            identifier (e.g., 'adam') or an optimizer instance.
+        student_loss: Loss function for the student's supervised learning component.
+            Can be a string identifier or a loss function instance.
+        metrics: List of metrics to track during training.
+        name: Name for the distiller model. Defaults to "distiller".
+        **kwargs: Additional keyword arguments passed to the parent Model class.
+
+    Example:
 
     ```python
-    from keras.distillation import (
-        Distiller, LogitsDistillation, FeatureDistillation
-    )
-
-    # Multiple distillation strategies
-    strategies = [
-        LogitsDistillation(),  # Will use Distiller's default temperature
-        LogitsDistillation(temperature=2.0),  # Override with specific temp
-        FeatureDistillation(
-            loss_type="mse",
-            teacher_layer_name="dense_1",
-            student_layer_name="dense_1"
-        )
-    ]
-
+    # Load pre-trained teacher model from KerasHub
+    import keras_hub as hub
+    
+    teacher = hub.models.CausalLM.from_preset("gemma3_4b_en")
+    student = hub.models.CausalLM.from_preset("gemma2_2b_en")
+    
+    # Create distillation strategy
+    strategy = LogitsDistillation(temperature=3.0)
+
+    # Create distiller
     distiller = Distiller(
         teacher=teacher,
         student=student,
-        strategies=strategies,
-        alpha=0.5,
-        temperature=4.0  # Default temperature for strategies without one
+        strategy=strategy,
+        student_loss_weight=0.7,
+        optimizer='adam',
+        student_loss='sparse_categorical_crossentropy',
+        metrics=['accuracy']
     )
+
+    # Train the distiller
+    distiller.fit(x_train, y_train, epochs=10, validation_split=0.2)
+
+    # Get the trained student model
+    trained_student = distiller.get_student_model()
     ```
 
-    **Multi-Output Model Distillation:**
+    For multi-output models:
 
     ```python
-    from keras.distillation import MultiOutputDistillation
-
-    # For models with multiple outputs
+    # Create multi-output strategy
     multi_strategy = MultiOutputDistillation(
         output_strategies={
-            0: LogitsDistillation(output_index=0),  # Uses default temperature
-            1: LogitsDistillation(
-                temperature=2.0, output_index=1
-            )  # Override temperature
+            0: LogitsDistillation(temperature=3.0, output_index=0),  # Classification
+            1: LogitsDistillation(temperature=2.0, output_index=1)   # Regression
         },
-        weights={0: 1.0, 1: 0.5}
-    )
-
-    distiller = Distiller(
-        teacher=multi_output_teacher,
-        student=multi_output_student,
-        strategies=[multi_strategy],
-        alpha=0.6,
-        temperature=3.0  # Default temperature
+        weights={0: 1.0, 1: 0.5}  # Weight classification more heavily
     )
-    ```
 
-    **Custom Loss Function:**
-
-    ```python
-    # Using custom student loss function
     distiller = Distiller(
         teacher=teacher,
         student=student,
-        strategies=[LogitsDistillation()],  # Uses default temperature
-        student_loss_fn=keras.losses.CategoricalCrossentropy(),
-        alpha=0.8,
-        temperature=5.0
-    )
-    ```
-
-    **Accessing and Saving the Trained Student Model:**
-
-    ```python
-    # After training
-    distiller.fit(x_train, y_train, epochs=10)
-
-    # Method 1: Direct access
-    trained_student = distiller.student
-
-    # Method 2: Using convenience method (recommended)
-    trained_student = distiller.get_student_model()
-
-    # Save the student model independently
-    trained_student.save('trained_student.keras')
-
-    # Use student model for inference
-    predictions = trained_student.predict(x_test)
-
-    # Further train the student model independently
-    trained_student.compile(
-        optimizer='adam', loss='sparse_categorical_crossentropy'
+        strategy=multi_strategy,
+        student_loss_weight=0.5,
+        optimizer='adam',
+        student_loss=['sparse_categorical_crossentropy', 'mse']
     )
-    trained_student.fit(x_new, y_new, epochs=5)
     ```
     """
 
@@ -173,10 +126,11 @@ def __init__(
         self,
         teacher,
         student,
-        strategies,
-        student_loss_fn=None,
-        alpha=0.5,
-        temperature=3.0,
+        strategy,
+        student_loss_weight=0.5,
+        optimizer="adam",
+        student_loss="sparse_categorical_crossentropy",
+        metrics=None,
         name="distiller",
         **kwargs,
     ):
@@ -192,20 +146,13 @@ def __init__(
         # Store configuration
         self.teacher = teacher
         self.student = student
-        self.strategies = (
-            strategies if isinstance(strategies, list) else [strategies]
-        )
-        self.alpha = alpha
-        self.temperature = temperature
-
-        # Apply default temperature to strategies that don't have one
-        self._apply_default_temperature()
+        self.student_loss_weight = student_loss_weight
 
-        # Set up student loss function
-        if student_loss_fn is None:
-            self.student_loss_fn = keras.losses.SparseCategoricalCrossentropy()
+        # Handle strategy input - can be single strategy or list
+        if isinstance(strategy, list):
+            self.strategies = strategy
         else:
-            self.student_loss_fn = student_loss_fn
+            self.strategies = [strategy]
 
         # Freeze teacher model
         self.teacher.trainable = False
@@ -217,21 +164,10 @@ def __init__(
         )
         self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
 
-    def _apply_default_temperature(self):
-        """Apply default temperature to strategies that support it."""
-        from keras.src.distillation.strategies import LogitsDistillation
-
-        for strategy in self.strategies:
-            if isinstance(strategy, LogitsDistillation):
-                # Use the new method to set default temperature
-                strategy.set_default_temperature(self.temperature)
-            # Handle nested strategies in MultiOutputDistillation
-            elif hasattr(strategy, "output_strategies"):
-                for nested_strategy in strategy.output_strategies.values():
-                    if isinstance(nested_strategy, LogitsDistillation):
-                        nested_strategy.set_default_temperature(
-                            self.temperature
-                        )
+        # Compile the model with provided parameters
+        self.compile(
+            optimizer=optimizer, loss=student_loss, metrics=metrics or []
+        )
 
     def _validate_models(self, teacher, student):
         """Validate that teacher and student are Keras models."""
@@ -324,8 +260,8 @@ def _compute_loss(
 
         # Compute student loss
         student_loss = 0.0
-        if self.alpha > 0.0 and y is not None:
-            # Try using compiled_loss first, fallback to student_loss_fn
+        if self.student_loss_weight > 0.0 and y is not None:
+            # Try using compiled_loss first, fallback to default loss
             if (
                 hasattr(self, "compiled_loss")
                 and self.compiled_loss is not None
@@ -337,16 +273,20 @@ def _compute_loss(
                     regularization_losses=[],
                 )
             else:
-                # Fallback: use student_loss_fn directly
+                # Fallback: use default loss function
                 if isinstance(y_pred, list) and len(y_pred) > 0:
                     # For multi-output, use first output for student loss
-                    student_loss = self.student_loss_fn(y[0], y_pred[0])
+                    student_loss = keras.losses.sparse_categorical_crossentropy(
+                        y[0], y_pred[0]
+                    )
                 else:
-                    student_loss = self.student_loss_fn(y, y_pred)
+                    student_loss = keras.losses.sparse_categorical_crossentropy(
+                        y, y_pred
+                    )
 
         # Compute distillation loss
         distillation_loss = 0.0
-        if self.alpha < 1.0:
+        if self.student_loss_weight < 1.0:
             for strategy in self.strategies:
                 # Get appropriate outputs for this strategy
                 teacher_outputs, student_outputs = self._get_strategy_outputs(
@@ -362,7 +302,8 @@ def _compute_loss(
 
         # Combine losses
         total_loss = (
-            self.alpha * student_loss + (1.0 - self.alpha) * distillation_loss
+            self.student_loss_weight * student_loss
+            + (1.0 - self.student_loss_weight) * distillation_loss
         )
 
         # Update metrics
@@ -401,15 +342,11 @@ def get_config(self):
                 "student": serialization_lib.serialize_keras_object(
                     self.student
                 ),
-                "strategies": [
+                "strategy": [
                     serialization_lib.serialize_keras_object(s)
                     for s in self.strategies
                 ],
-                "student_loss_fn": serialization_lib.serialize_keras_object(
-                    self.student_loss_fn
-                ),
-                "alpha": self.alpha,
-                "temperature": self.temperature,
+                "student_loss_weight": self.student_loss_weight,
                 "input_mapping": self.input_mapping,
                 "output_mapping": self.output_mapping,
             }
@@ -427,11 +364,8 @@ def from_config(cls, config):
         config["student"] = serialization_lib.deserialize_keras_object(
             config["student"]
         )
-        config["strategies"] = [
+        config["strategy"] = [
             serialization_lib.deserialize_keras_object(s)
-            for s in config["strategies"]
+            for s in config["strategy"]
         ]
-        config["student_loss_fn"] = serialization_lib.deserialize_keras_object(
-            config["student_loss_fn"]
-        )
         return cls(**config)
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 074e6e8cff72..8498c138ea2b 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -45,23 +45,18 @@ def setUp(self):
         self.teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
         self.student = SimpleStudent(vocab_size=10, hidden_dim=16)
 
-        # Create distillation strategy
-        self.strategy = LogitsDistillation()
+        # Create distillation strategy with explicit temperature
+        self.strategy = LogitsDistillation(temperature=2.0)
 
         # Create distiller
         self.distiller = Distiller(
             teacher=self.teacher,
             student=self.student,
-            strategies=[self.strategy],
-            alpha=0.5,
-            temperature=2.0,
-        )
-
-        # Compile distiller (avoid additional metrics for JAX sharding issues)
-        self.distiller.compile(
-            optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss="sparse_categorical_crossentropy",
-            steps_per_execution=1,
+            strategy=self.strategy,
+            student_loss_weight=0.5,
+            optimizer="adam",
+            student_loss="sparse_categorical_crossentropy",
+            metrics=["accuracy"],
         )
 
         # Create test data
@@ -76,17 +71,20 @@ def test_distiller_initialization(self):
         # Check that student is trainable
         self.assertTrue(self.student.trainable)
 
-        # Check alpha and temperature
-        self.assertEqual(self.distiller.alpha, 0.5)
-        self.assertEqual(self.distiller.temperature, 2.0)
+        # Check student_loss_weight
+        self.assertEqual(self.distiller.student_loss_weight, 0.5)
 
         # Check strategies
         self.assertLen(self.distiller.strategies, 1)
         self.assertIsInstance(self.distiller.strategies[0], LogitsDistillation)
 
-        # Check that strategy received the default temperature
+        # Check that strategy has the correct temperature
         self.assertEqual(self.distiller.strategies[0].temperature, 2.0)
 
+        # Check that model is compiled
+        self.assertIsNotNone(self.distiller.optimizer)
+        self.assertIsNotNone(self.distiller.compiled_loss)
+
     def test_distiller_call(self):
         """Test Distiller call method (inference)."""
         # Call should return student outputs
@@ -116,9 +114,10 @@ def test_teacher_freezing(self):
         Distiller(
             teacher=new_teacher,
             student=self.student,
-            strategies=[self.strategy],
-            alpha=0.5,
-            temperature=2.0,
+            strategy=self.strategy,
+            student_loss_weight=0.5,
+            optimizer=keras.optimizers.Adam(),
+            student_loss="sparse_categorical_crossentropy",
         )
 
         # Teacher should now be frozen
@@ -131,44 +130,36 @@ def test_model_compatibility_validation(self):
             Distiller(
                 teacher="not_a_model",
                 student=self.student,
-                strategies=[self.strategy],
+                strategy=self.strategy,
             )
 
         with self.assertRaises(ValueError):
             Distiller(
                 teacher=self.teacher,
                 student="not_a_model",
-                strategies=[self.strategy],
+                strategy=self.strategy,
             )
 
-    def test_alpha_weighting(self):
-        """Test that alpha correctly weights student vs distillation loss."""
-        # Test with alpha = 0.0 (only distillation loss)
+    def test_student_loss_weighting(self):
+        """Test that student_loss_weight correctly weights student vs distillation loss."""
+        # Test with student_loss_weight = 0.0 (only distillation loss)
         distiller_0 = Distiller(
             teacher=self.teacher,
             student=self.student,
-            strategies=[self.strategy],
-            alpha=0.0,
-            temperature=2.0,
-        )
-        distiller_0.compile(
+            strategy=self.strategy,
+            student_loss_weight=0.0,
             optimizer=keras.optimizers.Adam(),
-            loss="sparse_categorical_crossentropy",
-            steps_per_execution=1,
+            student_loss="sparse_categorical_crossentropy",
         )
 
-        # Test with alpha = 1.0 (only student loss)
+        # Test with student_loss_weight = 1.0 (only student loss)
         distiller_1 = Distiller(
             teacher=self.teacher,
             student=self.student,
-            strategies=[self.strategy],
-            alpha=1.0,
-            temperature=2.0,
-        )
-        distiller_1.compile(
+            strategy=self.strategy,
+            student_loss_weight=1.0,
             optimizer=keras.optimizers.Adam(),
-            loss="sparse_categorical_crossentropy",
-            steps_per_execution=1,
+            student_loss="sparse_categorical_crossentropy",
         )
 
         # Test that they can be used for training without errors
@@ -200,16 +191,11 @@ def test_full_training_workflow(self):
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategies=[LogitsDistillation(temperature=2.0)],
-            alpha=0.5,
-            temperature=2.0,
-        )
-
-        # Compile (avoid additional metrics to prevent JAX sharding issues)
-        distiller.compile(
+            strategy=self.strategy,
+            student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss="sparse_categorical_crossentropy",
-            steps_per_execution=1,
+            student_loss="sparse_categorical_crossentropy",
+            metrics=["accuracy"],
         )
 
         # Train the model
@@ -267,19 +253,14 @@ def test_evaluation_workflow(self):
         teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
         student = SimpleStudent(vocab_size=10, hidden_dim=16)
 
-        # Create and compile distiller
+        # Create distiller
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategies=[LogitsDistillation(temperature=2.0)],
-            alpha=0.5,
-            temperature=2.0,
-        )
-
-        distiller.compile(
+            strategy=self.strategy,
+            student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss="sparse_categorical_crossentropy",
-            steps_per_execution=1,
+            student_loss="sparse_categorical_crossentropy",
         )
 
         # Train briefly
@@ -306,19 +287,14 @@ def test_prediction_workflow(self):
         teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
         student = SimpleStudent(vocab_size=10, hidden_dim=16)
 
-        # Create and compile distiller
+        # Create distiller
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategies=[LogitsDistillation(temperature=2.0)],
-            alpha=0.5,
-            temperature=2.0,
-        )
-
-        distiller.compile(
+            strategy=self.strategy,
+            student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss="sparse_categorical_crossentropy",
-            steps_per_execution=1,
+            student_loss="sparse_categorical_crossentropy",
         )
 
         # Make predictions
@@ -339,8 +315,10 @@ def test_get_student_model_method(self):
         distiller = Distiller(
             teacher=self.teacher,
             student=self.student,
-            strategies=[LogitsDistillation()],
-            alpha=0.5,
+            strategy=self.strategy,
+            student_loss_weight=0.5,
+            optimizer=keras.optimizers.Adam(),
+            student_loss="sparse_categorical_crossentropy",
         )
 
         # Test that get_student_model returns the same as direct access
@@ -397,10 +375,10 @@ def test_distiller_serialization_and_saving(self):
         original_distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategies=strategies,
-            alpha=0.7,
-            temperature=4.0,
-            student_loss_fn=keras.losses.SparseCategoricalCrossentropy(),
+            strategy=strategies,
+            student_loss_weight=0.7,
+            optimizer=keras.optimizers.Adam(),
+            student_loss="sparse_categorical_crossentropy",
         )
 
         # Build the models by calling them
@@ -414,10 +392,8 @@ def test_distiller_serialization_and_saving(self):
         required_keys = [
             "teacher",
             "student",
-            "strategies",
-            "student_loss_fn",
-            "alpha",
-            "temperature",
+            "strategy",
+            "student_loss_weight",
             "input_mapping",
             "output_mapping",
         ]
@@ -432,8 +408,7 @@ def test_distiller_serialization_and_saving(self):
         reconstructed_distiller = Distiller.from_config(config)
 
         # Verify reconstruction
-        self.assertEqual(reconstructed_distiller.alpha, 0.7)
-        self.assertEqual(reconstructed_distiller.temperature, 4.0)
+        self.assertEqual(reconstructed_distiller.student_loss_weight, 0.7)
         self.assertEqual(len(reconstructed_distiller.strategies), 2)
 
         # Verify strategy types
@@ -454,7 +429,7 @@ def test_distiller_serialization_and_saving(self):
 
         # Test model saving and loading (full integration test)
         with tempfile.TemporaryDirectory() as temp_dir:
-            model_path = os.path.join(temp_dir, "distiller_model")
+            model_path = os.path.join(temp_dir, "distiller_model.keras")
 
             # Compile original distiller
             original_distiller.compile(
@@ -473,8 +448,7 @@ def test_distiller_serialization_and_saving(self):
             self.assertEqual(loaded_output.shape, (2, 10))
 
             # Verify parameters are preserved
-            self.assertEqual(loaded_distiller.alpha, 0.7)
-            self.assertEqual(loaded_distiller.temperature, 4.0)
+            self.assertEqual(loaded_distiller.student_loss_weight, 0.7)
 
         # The core serialization functionality is working
         self.assertTrue(True, "Distiller serialization test passed")
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 95695e62d841..3541767b2f97 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -7,13 +7,20 @@ class BaseDistillationStrategy:
     """Base class for distillation strategies.
 
     Distillation strategies define how to compute the distillation loss
-    between teacher and student outputs.
+    between teacher and student outputs. Each strategy implements a specific
+    approach to knowledge transfer, from simple logits matching to complex
+    multi-output distillation.
+
     To create custom distillation strategies, subclass this class and
     override the compute_loss method.
     """
 
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute distillation loss between teacher and student outputs.
+
+        This method should implement the specific distillation logic for
+        transferring knowledge from teacher to student.
+
         Args:
             teacher_outputs: Outputs from the teacher model. Can be a single
                 tensor or a list/tuple of tensors for multi-output models.
@@ -28,6 +35,10 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that teacher and student outputs are compatible.
 
+        This method ensures that the outputs from teacher and student models
+        are compatible for the specific distillation strategy. It should check
+        shapes, dimensions, and other requirements.
+
         Args:
             teacher_outputs: Outputs from the teacher model.
             student_outputs: Outputs from the student model.
@@ -50,29 +61,69 @@ def validate_outputs(self, teacher_outputs, student_outputs):
 
 @keras_export("keras.distillation.LogitsDistillation")
 class LogitsDistillation(BaseDistillationStrategy):
-    """Logits distillation strategy using Keras built-in loss functions.
+    """Distillation strategy that transfers knowledge from final model outputs (logits).
+
+    This strategy applies temperature scaling to the teacher's logits before computing
+    the loss between teacher and student predictions. It's the most common approach
+    for knowledge distillation.
+
+    How Logits Distillation Works:
+
+    1. Temperature Scaling: The teacher's logits are divided by a temperature
+       parameter (typically 3-5) before applying softmax. This creates "softer"
+       probability distributions that reveal relationships between classes.
+
+    2. Loss Computation: The loss is computed between the temperature-scaled
+       teacher logits and student logits using either KL divergence or categorical
+       crossentropy.
+
+    When to Use Logits Distillation:
+
+    - General Classification: Works well for most classification tasks
+    - Model Compression: Effective for reducing model size while maintaining accuracy
+    - Transfer Learning: Good for leveraging knowledge from pre-trained models
+    - Ensemble Distillation: Can combine multiple teacher models
 
-    This strategy distills knowledge using the logits (pre-softmax outputs)
-    from teacher and student models.
+    Temperature Guidelines:
+
+    - Low Temperature (1-2): Sharp distributions, similar to hard labels
+    - Medium Temperature (3-5): Balanced softness, most commonly used
+    - High Temperature (6-10): Very soft distributions, reveals subtle relationships
 
     Args:
         temperature: Temperature for softmax scaling. Higher values produce
-            softer probability distributions. If None, will use the default
-            temperature from the Distiller. Defaults to None.
+            softer probability distributions that are easier for the student to learn.
+            Typical values range from 3-5. Defaults to 3.0.
         loss_type: Type of loss function to use. Options:
-            - "kl_divergence": KL divergence using keras.losses.kl_divergence
-            - "categorical_crossentropy": Categorical crossentropy using
-              keras.losses.categorical_crossentropy
+            - "kl_divergence": KL divergence between teacher and student distributions
+            - "categorical_crossentropy": Crossentropy with teacher as target
         output_index: Index of the output to use for multi-output models.
             Defaults to 0.
+
+    Example:
+
+    ```python
+    # Basic logits distillation
+    strategy = LogitsDistillation(temperature=3.0)
+
+    # With categorical crossentropy loss
+    strategy = LogitsDistillation(
+        temperature=4.0,
+        loss_type="categorical_crossentropy"
+    )
+
+    # For multi-output models
+    strategy = LogitsDistillation(
+        temperature=3.0,
+        output_index=1  # Use second output
+    )
+    ```
     """
 
     def __init__(
-        self, temperature=None, loss_type="kl_divergence", output_index=0
+        self, temperature=3.0, loss_type="kl_divergence", output_index=0
     ):
-        # If no temperature provided, use sentinel value for Distiller detection
-        self.temperature = temperature if temperature is not None else 3.0
-        self._temperature_explicitly_set = temperature is not None
+        self.temperature = temperature
         self.loss_type = loss_type
         self.output_index = output_index
 
@@ -81,11 +132,6 @@ def __init__(
         if loss_type not in valid_loss_types:
             raise ValueError(f"loss_type must be one of {valid_loss_types}")
 
-    def set_default_temperature(self, default_temperature):
-        """Set the default temperature if none was explicitly provided."""
-        if not self._temperature_explicitly_set:
-            self.temperature = default_temperature
-
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for logits distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
@@ -191,24 +237,89 @@ def from_config(cls, config):
 
 @keras_export("keras.distillation.FeatureDistillation")
 class FeatureDistillation(BaseDistillationStrategy):
-    """Feature distillation strategy using intermediate layer features.
+    """Feature distillation strategy using intermediate layer representations.
+
+    Feature distillation transfers knowledge from intermediate layers of the
+    teacher model to corresponding layers of the student model. This approach
+    helps the student learn better internal representations and often leads
+    to superior performance compared to logits-only distillation.
+
+    How Feature Distillation Works:
+
+    1. Layer Selection: Specify which intermediate layers from teacher and
+       student models to use for distillation. These layers should have
+       compatible architectures or similar semantic meaning.
 
-    This strategy distills intermediate features from teacher to student,
-    not just the final outputs. It creates feature extraction models
-    to extract outputs from specified intermediate layers.
+    2. Feature Extraction: Extract activations from the specified layers
+       during forward pass. The teacher features are computed with training=False
+       (frozen), while student features are computed with training=True.
 
-    Note: If teacher and student features have different shapes, you may need
-    to add alignment layers or use models with compatible intermediate
-    feature dimensions.
+    3. Loss Computation: Compute loss between teacher and student features
+       using either MSE (for identical shapes) or cosine similarity (for
+       different shapes).
+
+    When to Use Feature Distillation:
+
+    - Similar Architectures: When teacher and student have similar layer
+      structures (e.g., both are CNNs with similar depths)
+    - Performance Improvement: Often leads to better student performance
+      than logits-only distillation
+    - Representation Learning: Helps student learn better internal features
+    - Multi-Scale Distillation: Can distill features from multiple layers
+      simultaneously
+
+    Layer Selection Guidelines:
+
+    - Early Layers: Capture low-level features (edges, textures)
+    - Middle Layers: Capture mid-level features (shapes, patterns)
+    - Late Layers: Capture high-level features (semantic concepts)
+    - Compatible Sizes: Choose layers with similar output dimensions
+    - Semantic Alignment: Match layers that serve similar functions
+
+    Loss Type Selection:
+
+    - MSE: Use when teacher and student features have identical shapes.
+      Provides direct feature matching.
+    - Cosine Similarity: Use when features have different shapes but
+      same feature dimension (last axis). Focuses on feature direction
+      rather than magnitude.
 
     Args:
         loss_type: Type of loss function to use. Options:
-            - "mse": Mean squared error using keras.losses.mean_squared_error
-            - "cosine": Cosine similarity using keras.losses.cosine_similarity
+            - "mse": Mean squared error between teacher and student features
+            - "cosine": Cosine similarity between feature vectors
         teacher_layer_name: Name of the teacher layer to extract features from.
             If None, uses the final output. Defaults to None.
         student_layer_name: Name of the student layer to extract features from.
             If None, uses the final output. Defaults to None.
+
+    Examples:
+
+    ```python
+    # Basic feature distillation from final outputs
+    strategy = FeatureDistillation(loss_type="mse")
+
+    # Distill from specific layers with compatible shapes
+    strategy = FeatureDistillation(
+        loss_type="mse",
+        teacher_layer_name="dense_1",
+        student_layer_name="dense_1"
+    )
+
+    # Use cosine similarity for different feature sizes
+    strategy = FeatureDistillation(
+        loss_type="cosine",
+        teacher_layer_name="conv2d_2",
+        student_layer_name="conv2d_1"
+    )
+
+    # Distill from final outputs (equivalent to logits distillation)
+    strategy = FeatureDistillation(
+        loss_type="mse",
+        teacher_layer_name=None,  # Final output
+        student_layer_name=None   # Final output
+    )
+    ```
     """
 
     def __init__(
@@ -457,18 +568,90 @@ def from_config(cls, config):
 
 @keras_export("keras.distillation.MultiOutputDistillation")
 class MultiOutputDistillation(BaseDistillationStrategy):
-    """Multi-output distillation strategy.
+    """Multi-output distillation strategy for complex models.
+
+    Multi-output distillation handles models with multiple outputs, such as
+    object detection models (classification + regression), multi-task learning
+    models, or any model with multiple prediction heads. This strategy allows
+    different distillation approaches for different outputs.
+
+    How Multi-Output Distillation Works:
+
+    1. Output Mapping: Map each output index to a specific distillation
+       strategy. Different outputs can use different strategies based on their
+       nature (classification vs regression, different loss functions, etc.).
 
-    Multi-output distillation strategy applies distillation to multiple
-    outputs. This strategy allows different distillation strategies to be
-    applied to different outputs of multi-output models.
+    2. Strategy Application: Apply the appropriate strategy to each output
+       pair (teacher output i → student output i).
+
+    3. Loss Combination: Combine the losses from all outputs using
+       configurable weights. This allows prioritizing certain outputs over others.
+
+    When to Use Multi-Output Distillation:
+
+    - Object Detection: Models with classification and bounding box regression
+    - Multi-Task Learning: Models that predict multiple related tasks
+    - Complex Architectures: Models with multiple prediction heads
+    - Different Output Types: When outputs have different characteristics
+      (e.g., categorical vs continuous)
+
+    Output Strategy Selection:
+
+    - Classification Outputs: Use LogitsDistillation with appropriate temperature
+    - Regression Outputs: Use LogitsDistillation with lower temperature or
+      FeatureDistillation with MSE loss
+    - Feature Outputs: Use FeatureDistillation to transfer intermediate representations
+    - Mixed Types: Combine different strategies for different outputs
+
+    Weight Configuration:
+
+    - Equal Weights: All outputs contribute equally to the total loss
+    - Task-Specific Weights: Weight outputs based on task importance
+    - Loss-Scale Weights: Adjust weights to balance different loss scales
+    - Performance-Based: Weight outputs based on their impact on final performance
 
     Args:
         output_strategies: Dict mapping output indices to distillation
-            strategies.
-            Each strategy will be applied to the corresponding output.
+            strategies. Each strategy will be applied to the corresponding output.
+            Example: {0: LogitsDistillation(), 1: FeatureDistillation()}
         weights: Dict mapping output indices to weights for combining losses.
             If None, all outputs are weighted equally. Defaults to None.
+            Example: {0: 1.0, 1: 0.5}  # First output twice as important
+
+    Examples:
+
+    ```python
+    # Object detection distillation (classification + regression)
+    strategy = MultiOutputDistillation(
+        output_strategies={
+            0: LogitsDistillation(temperature=3.0, output_index=0),  # Classification
+            1: LogitsDistillation(temperature=1.0, output_index=1)   # Regression
+        },
+        weights={0: 1.0, 1: 0.5}  # Weight classification more heavily
+    )
+
+    # Multi-task learning with different strategies
+    strategy = MultiOutputDistillation(
+        output_strategies={
+            0: LogitsDistillation(temperature=4.0, output_index=0),  # Task 1
+            1: FeatureDistillation(
+                loss_type="mse",
+                teacher_layer_name="dense_1",
+                student_layer_name="dense_1"
+            )  # Task 2
+        }
+    )
+
+    # Equal weighting for all outputs
+    strategy = MultiOutputDistillation(
+        output_strategies={
+            0: LogitsDistillation(temperature=3.0, output_index=0),
+            1: LogitsDistillation(temperature=3.0, output_index=1),
+            2: LogitsDistillation(temperature=3.0, output_index=2)
+        }
+        # weights=None (defaults to equal weights)
+    )
+    ```
     """
 
     def __init__(self, output_strategies, weights=None):
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index 515f522eeeb3..95054d7f4c5a 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -110,7 +110,6 @@ def test_initialization(self):
         self.assertEqual(strategy.temperature, 3.0)  # Default fallback
         self.assertEqual(strategy.loss_type, "kl_divergence")
         self.assertEqual(strategy.output_index, 0)
-        self.assertFalse(strategy._temperature_explicitly_set)
 
         # Test custom initialization
         strategy = LogitsDistillation(
@@ -121,32 +120,21 @@ def test_initialization(self):
         self.assertEqual(strategy.temperature, 5.0)
         self.assertEqual(strategy.loss_type, "categorical_crossentropy")
         self.assertEqual(strategy.output_index, 1)
-        self.assertTrue(strategy._temperature_explicitly_set)
 
     def test_invalid_loss_type(self):
         """Test that invalid loss types raise ValueError."""
         with self.assertRaises(ValueError):
             LogitsDistillation(loss_type="invalid_loss")
 
-    def test_default_temperature_mechanism(self):
-        """Test that default temperature can be set from Distiller."""
-        # Create strategy without explicit temperature
-        strategy = LogitsDistillation()
-        self.assertEqual(strategy.temperature, 3.0)
-        self.assertFalse(strategy._temperature_explicitly_set)
-
-        # Set default temperature
-        strategy.set_default_temperature(4.0)
-        self.assertEqual(strategy.temperature, 4.0)
-
+    def test_temperature_configuration(self):
+        """Test that temperature is properly configured."""
         # Create strategy with explicit temperature
-        strategy_explicit = LogitsDistillation(temperature=2.0)
-        self.assertEqual(strategy_explicit.temperature, 2.0)
-        self.assertTrue(strategy_explicit._temperature_explicitly_set)
+        strategy = LogitsDistillation(temperature=4.0)
+        self.assertEqual(strategy.temperature, 4.0)
 
-        # Try to set default - should not change
-        strategy_explicit.set_default_temperature(4.0)
-        self.assertEqual(strategy_explicit.temperature, 2.0)  # Unchanged
+        # Create strategy with default temperature
+        strategy_default = LogitsDistillation()
+        self.assertEqual(strategy_default.temperature, 3.0)
 
     def test_logits_distillation_loss_kl_divergence(self):
         """Test logits distillation loss computation with KL divergence."""
@@ -773,20 +761,21 @@ def test_end_to_end_with_multi_output_models(self):
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategies=[multi_strategy],
-            alpha=0.5,
-            temperature=2.0,
-        )
-
-        distiller.compile(
+            strategy=[multi_strategy],
+            student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            loss=[
+            student_loss=[
                 "sparse_categorical_crossentropy",
                 "sparse_categorical_crossentropy",
             ],
-            steps_per_execution=1,
+            metrics=[
+                ["accuracy"],  # Metrics for output 0
+                ["accuracy"]   # Metrics for output 1
+            ]
         )
 
+
+
         # Create test data for multi-output model
         x = np.random.random((20, 5)).astype(np.float32)
         # Multi-output targets: [output1_targets, output2_targets]

From e8229c23b01cfae78a56eead64b4a0526e217f27 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Fri, 15 Aug 2025 16:43:46 -0700
Subject: [PATCH 12/31] address comments

---
 keras/src/distillation/distiller.py       | 155 +++++++++++++---------
 keras/src/distillation/strategies.py      | 136 +++++++++++++------
 keras/src/distillation/strategies_test.py |  62 +++------
 3 files changed, 210 insertions(+), 143 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 4d29fb3267ff..ef8b90d4c3fd 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -7,8 +7,8 @@
 class Distiller(Model):
     """Knowledge Distillation model for transferring knowledge from teacher to student.
 
-    Knowledge distillation transfers knowledge from a large, complex model (teacher)
-    to a smaller, simpler model (student). The student learns from both ground truth
+    Knowledge distillation transfers knowledge from a large, complex model (`teacher`)
+    to a smaller, simpler model (`student`). The student learns from both ground truth
     labels and the teacher's predictions, often achieving better performance than
     training on labels alone.
 
@@ -22,10 +22,10 @@ class Distiller(Model):
 
     3. Distillation Process: The student learns from two sources:
        - Hard targets: Traditional supervised learning with ground truth labels
-       - Soft targets: The teacher's predictions, which contain rich information
+       - Soft targets: The teacher's predictions, which contain information
          about class relationships and confidence levels
 
-    4. Temperature Scaling: The teacher's logits are divided by a temperature
+    4. Temperature Scaling: The teacher's logits are divided by a `temperature`
        parameter before applying softmax, creating "softer" probability distributions
        that are easier for the student to learn from.
 
@@ -38,36 +38,36 @@ class Distiller(Model):
 
     Strategy Selection Guide:
 
-    - LogitsDistillation: Most common approach. Transfers final output knowledge.
-      Best for classification tasks where you want the student to learn the teacher's
+    - `LogitsDistillation`: Most common approach. Transfers final output knowledge.
+      Use for classification tasks where you want the student to learn the teacher's
       decision boundaries and confidence patterns.
 
-    - FeatureDistillation: Transfers intermediate representations. Best when teacher
+    - `FeatureDistillation`: Transfers intermediate representations. Use when teacher
       and student have similar architectures, as it helps the student learn better
       internal representations. Often leads to better performance than logits-only.
 
-    - MultiOutputDistillation: For complex models with multiple outputs (e.g.,
+    - `MultiOutputDistillation`: For models with multiple outputs (e.g.,
       object detection with classification and regression heads). Allows different
       distillation strategies for different outputs.
 
     Args:
-        teacher: A trained keras.Model that serves as the knowledge source.
+        teacher: A trained `keras.Model` that serves as the knowledge source.
             The teacher model is frozen during distillation.
-        student: A keras.Model to be trained through distillation. This model
+        student: A `keras.Model` to be trained through distillation. This model
             will learn from both ground truth labels and the teacher's predictions.
         strategy: Distillation strategy or list of strategies. Can be a single
-            strategy (e.g., LogitsDistillation) or a list of strategies for
+            strategy (e.g., `LogitsDistillation`) or a list of strategies for
             multi-strategy distillation.
         student_loss_weight: Weight for the student's supervised loss component.
             Must be between 0 and 1. Higher values emphasize ground truth labels,
             lower values emphasize teacher predictions. Defaults to 0.5.
         optimizer: Optimizer for training the student model. Can be a string
-            identifier (e.g., 'adam') or an optimizer instance.
+            identifier (e.g., `'adam'`) or an optimizer instance.
         student_loss: Loss function for the student's supervised learning component.
             Can be a string identifier or a loss function instance.
         metrics: List of metrics to track during training.
-        name: Name for the distiller model. Defaults to "distiller".
-        **kwargs: Additional keyword arguments passed to the parent Model class.
+        name: Name for the distiller model. Defaults to `"distiller"`.
+        **kwargs: Additional keyword arguments passed to the parent `Model` class.
 
     Example:
 
@@ -147,6 +147,15 @@ def __init__(
         self.teacher = teacher
         self.student = student
         self.student_loss_weight = student_loss_weight
+        
+        # Convert string loss to function if needed
+        if isinstance(student_loss, str):
+            self._student_loss = keras.losses.get(student_loss)
+        elif isinstance(student_loss, list):
+            # Handle multi-output loss functions
+            self._student_loss = [keras.losses.get(loss) if isinstance(loss, str) else loss for loss in student_loss]
+        else:
+            self._student_loss = student_loss
 
         # Handle strategy input - can be single strategy or list
         if isinstance(strategy, list):
@@ -217,65 +226,83 @@ def call(self, inputs, training=None, **kwargs):
         """Forward pass returns student predictions."""
         return self.student(inputs, training=training, **kwargs)
 
-    def _get_strategy_outputs(self, strategy, inputs, training=None):
-        """Get the appropriate outputs for a specific strategy.
-
-        For FeatureDistillation, this extracts intermediate features.
-        For other strategies, this returns the final model outputs.
-        """
-        from keras.src.distillation.strategies import FeatureDistillation
-
-        if isinstance(strategy, FeatureDistillation):
-            # Extract features from specified intermediate layers
-            teacher_features = strategy._get_teacher_features(
-                self.teacher, inputs
-            )
-            student_features = strategy._get_student_features(
-                self.student, inputs
-            )
-            return teacher_features, student_features
-        else:
-            # Use final model outputs for other strategies
-            teacher_outputs = self.teacher(inputs, training=False)
-            student_outputs = self.student(inputs, training=training)
-            return teacher_outputs, student_outputs
-
-    def _compute_loss(
+    def compute_loss(
         self, x=None, y=None, y_pred=None, sample_weight=None, training=None
     ):
         """Compute combined distillation loss.
 
         This method integrates distillation into Keras's standard training
-        workflow.
-        """
-        # Get student predictions
-        if y_pred is None:
-            y_pred = self(x, training=training)
+        workflow. Users can override this method to implement custom distillation
+        loss computation.
+
+        Args:
+            x: Input data.
+            y: Target data.
+            y_pred: Model predictions.
+            sample_weight: Sample weights.
+            training: Whether the model is in training mode.
+
+        Returns:
+            Combined loss tensor.
 
+        Example:
+            ```python
+            # Custom distillation loss by overriding compute_loss
+            class CustomDistiller(Distiller):
+                def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None, training=None):
+                    # Custom student loss computation
+                    student_loss = keras.losses.sparse_categorical_crossentropy(y, y_pred)
+                    
+                    # Custom distillation loss computation
+                    teacher_outputs = self.teacher(x, training=False)
+                    student_outputs = self.student(x, training=training)
+                    
+                    # Custom loss logic here
+                    distillation_loss = self._custom_distillation_loss(teacher_outputs, student_outputs)
+                    
+                    # Combine losses with custom weighting
+                    total_loss = 0.7 * student_loss + 0.3 * distillation_loss
+                    
+                    return total_loss
+                
+                def _custom_distillation_loss(self, teacher_outputs, student_outputs):
+                    # Implement custom distillation loss logic
+                    from keras import ops
+                    return ops.mean(ops.square(teacher_outputs - student_outputs))
+            ```
+        """
         # Normalize y_pred and y to lists for consistent handling
         if not isinstance(y_pred, (list, tuple)):
             y_pred = [y_pred]
         if y is not None and not isinstance(y, (list, tuple)):
             y = [y]
 
-        # Compute student loss
+        # Compute student loss using basic loss function
         student_loss = 0.0
         if self.student_loss_weight > 0.0 and y is not None:
-            # Try using compiled_loss first, fallback to default loss
-            if (
-                hasattr(self, "compiled_loss")
-                and self.compiled_loss is not None
-            ):
-                student_loss = self.compiled_loss(
-                    y,
-                    y_pred,
-                    sample_weight=sample_weight,
-                    regularization_losses=[],
-                )
+            # Use the configured loss function
+            if hasattr(self, '_student_loss'):
+                if isinstance(self._student_loss, list):
+                    # Multi-output loss
+                    if isinstance(y_pred, list) and len(y_pred) > 0:
+                        student_loss = sum(
+                            loss_fn(y[i], y_pred[i]) 
+                            for i, loss_fn in enumerate(self._student_loss)
+                            if i < len(y_pred)
+                        )
+                    else:
+                        # Single output with multi-output loss list
+                        student_loss = self._student_loss[0](y[0], y_pred[0])
+                else:
+                    # Single loss function
+                    if isinstance(y_pred, list) and len(y_pred) > 0:
+                        # For multi-output, use first output for student loss
+                        student_loss = self._student_loss(y[0], y_pred[0])
+                    else:
+                        student_loss = self._student_loss(y, y_pred)
             else:
-                # Fallback: use default loss function
+                # Fallback to default
                 if isinstance(y_pred, list) and len(y_pred) > 0:
-                    # For multi-output, use first output for student loss
                     student_loss = keras.losses.sparse_categorical_crossentropy(
                         y[0], y_pred[0]
                     )
@@ -287,16 +314,14 @@ def _compute_loss(
         # Compute distillation loss
         distillation_loss = 0.0
         if self.student_loss_weight < 1.0:
+            # Get teacher outputs
+            teacher_outputs = self.teacher(x, training=False)
+            
             for strategy in self.strategies:
-                # Get appropriate outputs for this strategy
-                teacher_outputs, student_outputs = self._get_strategy_outputs(
-                    strategy, x, training=training
-                )
-
                 # Validate and compute loss for this strategy
-                strategy.validate_outputs(teacher_outputs, student_outputs)
+                strategy.validate_outputs(teacher_outputs, y_pred)
                 strategy_loss = strategy.compute_loss(
-                    teacher_outputs, student_outputs
+                    teacher_outputs, y_pred
                 )
                 distillation_loss += strategy_loss
 
@@ -313,6 +338,8 @@ def _compute_loss(
 
         return total_loss
 
+
+
     def reset_metrics(self):
         """Reset all metrics."""
         super().reset_metrics()
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 3541767b2f97..e590c1cea16d 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -8,11 +8,11 @@ class BaseDistillationStrategy:
 
     Distillation strategies define how to compute the distillation loss
     between teacher and student outputs. Each strategy implements a specific
-    approach to knowledge transfer, from simple logits matching to complex
-    multi-output distillation.
+    approach to knowledge transfer, from simple logits matching to multi-output
+    distillation.
 
     To create custom distillation strategies, subclass this class and
-    override the compute_loss method.
+    override the `compute_loss` method.
     """
 
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
@@ -69,7 +69,7 @@ class LogitsDistillation(BaseDistillationStrategy):
 
     How Logits Distillation Works:
 
-    1. Temperature Scaling: The teacher's logits are divided by a temperature
+    1. Temperature Scaling: The teacher's logits are divided by a `temperature`
        parameter (typically 3-5) before applying softmax. This creates "softer"
        probability distributions that reveal relationships between classes.
 
@@ -95,8 +95,8 @@ class LogitsDistillation(BaseDistillationStrategy):
             softer probability distributions that are easier for the student to learn.
             Typical values range from 3-5. Defaults to 3.0.
         loss_type: Type of loss function to use. Options:
-            - "kl_divergence": KL divergence between teacher and student distributions
-            - "categorical_crossentropy": Crossentropy with teacher as target
+            - `"kl_divergence"`: KL divergence between teacher and student distributions
+            - `"categorical_crossentropy"`: Crossentropy with teacher as target
         output_index: Index of the output to use for multi-output models.
             Defaults to 0.
 
@@ -112,6 +112,25 @@ class LogitsDistillation(BaseDistillationStrategy):
         loss_type="categorical_crossentropy"
     )
 
+    # Custom loss by subclassing
+    class CustomLogitsDistillation(LogitsDistillation):
+        def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
+            from keras import ops
+            # Get the outputs to distill
+            teacher_logits = teacher_outputs[self.output_index]
+            student_logits = student_outputs[self.output_index]
+            
+            # Apply temperature scaling
+            teacher_logits = teacher_logits / self.temperature
+            student_logits = student_logits / self.temperature
+            
+            # Custom loss computation
+            teacher_probs = ops.softmax(teacher_logits, axis=-1)
+            student_probs = ops.softmax(student_logits, axis=-1)
+            return ops.mean(keras.losses.kl_divergence(teacher_probs, student_probs))
+    
+    strategy = CustomLogitsDistillation(temperature=3.0)
+
     # For multi-output models
     strategy = LogitsDistillation(
         temperature=3.0,
@@ -121,16 +140,22 @@ class LogitsDistillation(BaseDistillationStrategy):
     """
 
     def __init__(
-        self, temperature=3.0, loss_type="kl_divergence", output_index=0
+        self,
+        temperature=3.0,
+        loss_type="kl_divergence",
+        output_index=0,
     ):
+        super().__init__()
         self.temperature = temperature
         self.loss_type = loss_type
         self.output_index = output_index
 
         # Validate loss_type
-        valid_loss_types = ["kl_divergence", "categorical_crossentropy"]
-        if loss_type not in valid_loss_types:
-            raise ValueError(f"loss_type must be one of {valid_loss_types}")
+        if loss_type not in ["kl_divergence", "categorical_crossentropy"]:
+            raise ValueError(
+                f"loss_type must be one of ['kl_divergence', 'categorical_crossentropy'], "
+                f"got {loss_type}"
+            )
 
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for logits distillation."""
@@ -242,7 +267,7 @@ class FeatureDistillation(BaseDistillationStrategy):
     Feature distillation transfers knowledge from intermediate layers of the
     teacher model to corresponding layers of the student model. This approach
     helps the student learn better internal representations and often leads
-    to superior performance compared to logits-only distillation.
+    to better performance compared to logits-only distillation.
 
     How Feature Distillation Works:
 
@@ -251,8 +276,8 @@ class FeatureDistillation(BaseDistillationStrategy):
        compatible architectures or similar semantic meaning.
 
     2. Feature Extraction: Extract activations from the specified layers
-       during forward pass. The teacher features are computed with training=False
-       (frozen), while student features are computed with training=True.
+       during forward pass. The teacher features are computed with `training=False`
+       (frozen), while student features are computed with `training=True`.
 
     3. Loss Computation: Compute loss between teacher and student features
        using either MSE (for identical shapes) or cosine similarity (for
@@ -278,16 +303,16 @@ class FeatureDistillation(BaseDistillationStrategy):
 
     Loss Type Selection:
 
-    - MSE: Use when teacher and student features have identical shapes.
+    - `"mse"`: Use when teacher and student features have identical shapes.
       Provides direct feature matching.
-    - Cosine Similarity: Use when features have different shapes but
+    - `"cosine"`: Use when features have different shapes but
       same feature dimension (last axis). Focuses on feature direction
       rather than magnitude.
 
     Args:
         loss_type: Type of loss function to use. Options:
-            - "mse": Mean squared error between teacher and student features
-            - "cosine": Cosine similarity between feature vectors
+            - `"mse"`: Mean squared error between teacher and student features
+            - `"cosine"`: Cosine similarity between feature vectors
         teacher_layer_name: Name of the teacher layer to extract features from.
             If None, uses the final output. Defaults to None.
         student_layer_name: Name of the student layer to extract features from.
@@ -299,6 +324,22 @@ class FeatureDistillation(BaseDistillationStrategy):
     # Basic feature distillation from final outputs
     strategy = FeatureDistillation(loss_type="mse")
 
+    # Custom loss by subclassing
+    class CustomFeatureDistillation(FeatureDistillation):
+        def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
+            from keras import ops
+            # Use first output by default
+            teacher_features = teacher_outputs[0]
+            student_features = student_outputs[0]
+            
+            # Custom L1 loss for feature distillation
+            return ops.mean(ops.abs(teacher_features - student_features))
+    
+    strategy = CustomFeatureDistillation(
+        teacher_layer_name="dense_1",
+        student_layer_name="dense_1"
+    )
+
     # Distill from specific layers with compatible shapes
     strategy = FeatureDistillation(
         loss_type="mse",
@@ -340,8 +381,8 @@ def __init__(
 
     def _get_teacher_features(self, teacher_model, inputs):
         """Extract features from teacher model."""
+        # No specific layer, use the final model output
         if self.teacher_layer_name is None:
-            # No specific layer, use the full model
             return teacher_model(inputs, training=False)
 
         # For intermediate layer extraction, we need to create a custom function
@@ -369,8 +410,8 @@ def _get_teacher_features(self, teacher_model, inputs):
 
     def _get_student_features(self, student_model, inputs):
         """Extract features from student model."""
+        # No specific layer, use the final model output
         if self.student_layer_name is None:
-            # No specific layer, use the full model
             return student_model(inputs, training=True)
 
         # For intermediate layer extraction, we need to create a custom function
@@ -421,6 +462,8 @@ def _create_feature_extractor(self, model, layer_name):
         if target_layer is None:
             raise ValueError(
                 f"Layer '{layer_name}' not found in model. "
+                f"This may happen with a subclassed model that cannot be "
+                f"traversed using the standard layer API. "
                 f"Available layers: {[layer.name for layer in model.layers]}"
             )
 
@@ -452,8 +495,8 @@ def _create_feature_extractor(self, model, layer_name):
                 raise ValueError(
                     f"Could not create a feature extraction model for layer "
                     f"'{layer_name}'. This is likely because the model is a "
-                    f"subclassed model with a complex topology that cannot be "
-                    f"introspected. Error: {e}"
+                    f"subclassed model that cannot be traversed using the "
+                    f"standard layer API. Error: {e}"
                 )
 
     def validate_outputs(self, teacher_outputs, student_outputs):
@@ -532,7 +575,7 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
 
         if self.loss_type == "mse":
             # Use Keras MeanSquaredError directly and reduce to scalar
-            return ops.mean(
+            loss = ops.mean(
                 keras.losses.mean_squared_error(
                     teacher_features, student_features
                 )
@@ -547,11 +590,13 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
                 )
             )
             # Convert similarity to distance: distance = 1 - similarity
-            return 1.0 - similarity
+            loss = 1.0 - similarity
 
         else:
             raise ValueError(f"Unknown loss_type: {self.loss_type}")
 
+        return loss
+
     def get_config(self):
         """Get configuration for serialization."""
         return {
@@ -568,7 +613,7 @@ def from_config(cls, config):
 
 @keras_export("keras.distillation.MultiOutputDistillation")
 class MultiOutputDistillation(BaseDistillationStrategy):
-    """Multi-output distillation strategy for complex models.
+    """Multi-output distillation strategy for models with multiple outputs.
 
     Multi-output distillation handles models with multiple outputs, such as
     object detection models (classification + regression), multi-task learning
@@ -591,17 +636,18 @@ class MultiOutputDistillation(BaseDistillationStrategy):
 
     - Object Detection: Models with classification and bounding box regression
     - Multi-Task Learning: Models that predict multiple related tasks
-    - Complex Architectures: Models with multiple prediction heads
+    - Multiple Prediction Heads: Models with multiple outputs
     - Different Output Types: When outputs have different characteristics
       (e.g., categorical vs continuous)
 
     Output Strategy Selection:
 
-    - Classification Outputs: Use LogitsDistillation with appropriate temperature
-    - Regression Outputs: Use LogitsDistillation with lower temperature or
-      FeatureDistillation with MSE loss
-    - Feature Outputs: Use FeatureDistillation to transfer intermediate representations
+    - Classification Outputs: Use `LogitsDistillation` with appropriate temperature
+    - Regression Outputs: Use `LogitsDistillation` with lower temperature or
+      `FeatureDistillation` with MSE loss
+    - Feature Outputs: Use `FeatureDistillation` to transfer intermediate representations
     - Mixed Types: Combine different strategies for different outputs
+    - Custom Losses: Each strategy can be subclassed to override `compute_loss` method
 
     Weight Configuration:
 
@@ -613,10 +659,10 @@ class MultiOutputDistillation(BaseDistillationStrategy):
     Args:
         output_strategies: Dict mapping output indices to distillation
             strategies. Each strategy will be applied to the corresponding output.
-            Example: {0: LogitsDistillation(), 1: FeatureDistillation()}
+            Example: `{0: LogitsDistillation(), 1: FeatureDistillation()}`
         weights: Dict mapping output indices to weights for combining losses.
             If None, all outputs are weighted equally. Defaults to None.
-            Example: {0: 1.0, 1: 0.5}  # First output twice as important
+            Example: `{0: 1.0, 1: 0.5}`  # First output twice as important
 
     Examples:
 
@@ -630,15 +676,29 @@ class MultiOutputDistillation(BaseDistillationStrategy):
         weights={0: 1.0, 1: 0.5}  # Weight classification more heavily
     )
 
-    # Multi-task learning with different strategies
+    # Multi-task learning with custom strategies
+    class CustomLogitsDistillation(LogitsDistillation):
+        def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
+            from keras import ops
+            teacher_logits = teacher_outputs[self.output_index]
+            student_logits = student_outputs[self.output_index]
+            teacher_logits = teacher_logits / self.temperature
+            student_logits = student_logits / self.temperature
+            teacher_probs = ops.softmax(teacher_logits, axis=-1)
+            student_probs = ops.softmax(student_logits, axis=-1)
+            return ops.mean(keras.losses.kl_divergence(teacher_probs, student_probs))
+    
+    class CustomFeatureDistillation(FeatureDistillation):
+        def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
+            from keras import ops
+            teacher_features = teacher_outputs[0]
+            student_features = student_outputs[0]
+            return ops.mean(ops.abs(teacher_features - student_features))
+    
     strategy = MultiOutputDistillation(
         output_strategies={
-            0: LogitsDistillation(temperature=4.0, output_index=0),  # Task 1
-            1: FeatureDistillation(
-                loss_type="mse",
-                teacher_layer_name="dense_1",
-                student_layer_name="dense_1"
-            )  # Task 2
+            0: CustomLogitsDistillation(temperature=4.0, output_index=0),
+            1: CustomFeatureDistillation(output_index=1)
         }
     )
 
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index 95054d7f4c5a..406a82c45e88 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -249,12 +249,9 @@ def test_output_validation(self):
     def test_get_config(self):
         """Test get_config method."""
         strategy = LogitsDistillation(
-            temperature=3.0,
-            loss_type="categorical_crossentropy",
-            output_index=1,
+            temperature=3.0, loss_type="categorical_crossentropy", output_index=1
         )
         config = strategy.get_config()
-
         expected_config = {
             "temperature": 3.0,
             "loss_type": "categorical_crossentropy",
@@ -264,16 +261,17 @@ def test_get_config(self):
 
     def test_serialization(self):
         """Test strategy serialization and deserialization."""
-        import json
-
-        strategy = LogitsDistillation(
-            temperature=4.0,
-            loss_type="categorical_crossentropy",
-            output_index=1,
-        )
-
-        # Test get_config
-        config = strategy.get_config()
+        original_strategy = LogitsDistillation(
+            temperature=4.0, loss_type="categorical_crossentropy", output_index=1
+        )
+        config = original_strategy.get_config()
+        reconstructed_strategy = LogitsDistillation.from_config(config)
+        
+        self.assertEqual(original_strategy.temperature, reconstructed_strategy.temperature)
+        self.assertEqual(original_strategy.loss_type, reconstructed_strategy.loss_type)
+        self.assertEqual(original_strategy.output_index, reconstructed_strategy.output_index)
+        
+        # Test config matches expected
         expected_config = {
             "temperature": 4.0,
             "loss_type": "categorical_crossentropy",
@@ -281,16 +279,6 @@ def test_serialization(self):
         }
         self.assertEqual(config, expected_config)
 
-        # Test JSON serialization
-        json_str = json.dumps(config)
-        self.assertIsInstance(json_str, str)
-
-        # Test from_config
-        reconstructed = LogitsDistillation.from_config(config)
-        self.assertEqual(reconstructed.temperature, 4.0)
-        self.assertEqual(reconstructed.loss_type, "categorical_crossentropy")
-        self.assertEqual(reconstructed.output_index, 1)
-
 
 @pytest.mark.requires_trainable_backend
 class TestFeatureDistillation(TestCase):
@@ -552,7 +540,6 @@ def test_get_config(self):
             teacher_layer_name="teacher_layer",
             student_layer_name="student_layer",
         )
-
         config = strategy.get_config()
         expected_config = {
             "loss_type": "cosine",
@@ -563,16 +550,19 @@ def test_get_config(self):
 
     def test_serialization(self):
         """Test strategy serialization and deserialization."""
-        import json
-
-        strategy = FeatureDistillation(
+        original_strategy = FeatureDistillation(
             loss_type="cosine",
             teacher_layer_name="teacher_layer",
             student_layer_name="student_layer",
         )
-
-        # Test get_config
-        config = strategy.get_config()
+        config = original_strategy.get_config()
+        reconstructed_strategy = FeatureDistillation.from_config(config)
+        
+        self.assertEqual(original_strategy.loss_type, reconstructed_strategy.loss_type)
+        self.assertEqual(original_strategy.teacher_layer_name, reconstructed_strategy.teacher_layer_name)
+        self.assertEqual(original_strategy.student_layer_name, reconstructed_strategy.student_layer_name)
+        
+        # Test config matches expected
         expected_config = {
             "loss_type": "cosine",
             "teacher_layer_name": "teacher_layer",
@@ -580,16 +570,6 @@ def test_serialization(self):
         }
         self.assertEqual(config, expected_config)
 
-        # Test JSON serialization
-        json_str = json.dumps(config)
-        self.assertIsInstance(json_str, str)
-
-        # Test from_config
-        reconstructed = FeatureDistillation.from_config(config)
-        self.assertEqual(reconstructed.loss_type, "cosine")
-        self.assertEqual(reconstructed.teacher_layer_name, "teacher_layer")
-        self.assertEqual(reconstructed.student_layer_name, "student_layer")
-
 
 @pytest.mark.requires_trainable_backend
 class TestMultiOutputDistillation(TestCase):

From 387595a85fd5763f53bac55265c0bdf72c208543 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Fri, 15 Aug 2025 17:14:51 -0700
Subject: [PATCH 13/31] run pre-commit

---
 keras/src/distillation/distiller.py       | 122 ++++++++++++----------
 keras/src/distillation/distiller_test.py  |   1 -
 keras/src/distillation/strategies.py      | 117 ++++++++++++---------
 keras/src/distillation/strategies_test.py |  48 ++++++---
 4 files changed, 168 insertions(+), 120 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index ef8b90d4c3fd..584cef4ba390 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -5,17 +5,18 @@
 
 @keras_export("keras.distillation.Distiller")
 class Distiller(Model):
-    """Knowledge Distillation model for transferring knowledge from teacher to student.
+    """Distillation model for transferring knowledge from teacher to student.
 
-    Knowledge distillation transfers knowledge from a large, complex model (`teacher`)
-    to a smaller, simpler model (`student`). The student learns from both ground truth
-    labels and the teacher's predictions, often achieving better performance than
-    training on labels alone.
+    Knowledge distillation transfers knowledge from a large, complex model
+    (`teacher`) to a smaller, simpler model (`student`). The student learns
+    from both ground truth labels and the teacher's predictions, often
+    achieving better performance than training on labels alone.
 
     How Knowledge Distillation Works:
 
-    1. Teacher Model: A pre-trained, larger model that has learned complex patterns
-       and relationships in the data. The teacher is frozen during distillation.
+    1. Teacher Model: A pre-trained, larger model that has learned complex
+       patterns and relationships in the data. The teacher is frozen during
+       distillation.
 
     2. Student Model: A smaller, simpler model that we want to train to mimic
        the teacher's behavior while being more efficient for deployment.
@@ -26,58 +27,64 @@ class Distiller(Model):
          about class relationships and confidence levels
 
     4. Temperature Scaling: The teacher's logits are divided by a `temperature`
-       parameter before applying softmax, creating "softer" probability distributions
-       that are easier for the student to learn from.
+       parameter before applying softmax, creating "softer" probability
+       distributions that are easier for the student to learn from.
 
     When to Use Knowledge Distillation:
 
-    - Model Compression: Reduce model size for deployment on resource-constrained devices
-    - Performance Improvement: Student models often outperform models trained only on labels
+    - Model Compression: Reduce model size for deployment on
+      resource-constrained devices
+    - Performance Improvement: Student models often outperform models trained
+      only on labels
     - Transfer Learning: Leverage knowledge from large pre-trained models
-    - Ensemble Distillation: Combine multiple teacher models into a single student
+    - Ensemble Distillation: Combine multiple teacher models into a single
+      student
 
     Strategy Selection Guide:
 
-    - `LogitsDistillation`: Most common approach. Transfers final output knowledge.
-      Use for classification tasks where you want the student to learn the teacher's
-      decision boundaries and confidence patterns.
+    - `LogitsDistillation`: Most common approach. Transfers final output
+      knowledge. Use for classification tasks where you want the student to
+      learn the teacher's decision boundaries and confidence patterns.
 
-    - `FeatureDistillation`: Transfers intermediate representations. Use when teacher
-      and student have similar architectures, as it helps the student learn better
-      internal representations. Often leads to better performance than logits-only.
+    - `FeatureDistillation`: Transfers intermediate representations. Use when
+      teacher and student have similar architectures, as it helps the student
+      learn better internal representations. Often leads to better performance
+      than logits-only.
 
     - `MultiOutputDistillation`: For models with multiple outputs (e.g.,
-      object detection with classification and regression heads). Allows different
-      distillation strategies for different outputs.
+      object detection with classification and regression heads). Allows
+      different distillation strategies for different outputs.
 
     Args:
         teacher: A trained `keras.Model` that serves as the knowledge source.
             The teacher model is frozen during distillation.
         student: A `keras.Model` to be trained through distillation. This model
-            will learn from both ground truth labels and the teacher's predictions.
+            will learn from both ground truth labels and the teacher's
+            predictions.
         strategy: Distillation strategy or list of strategies. Can be a single
             strategy (e.g., `LogitsDistillation`) or a list of strategies for
             multi-strategy distillation.
         student_loss_weight: Weight for the student's supervised loss component.
-            Must be between 0 and 1. Higher values emphasize ground truth labels,
-            lower values emphasize teacher predictions. Defaults to 0.5.
+            Must be between 0 and 1. Higher values emphasize ground truth
+            labels, lower values emphasize teacher predictions. Defaults to 0.5.
         optimizer: Optimizer for training the student model. Can be a string
             identifier (e.g., `'adam'`) or an optimizer instance.
-        student_loss: Loss function for the student's supervised learning component.
-            Can be a string identifier or a loss function instance.
+        student_loss: Loss function for the student's supervised learning
+            component. Can be a string identifier or a loss function instance.
         metrics: List of metrics to track during training.
         name: Name for the distiller model. Defaults to `"distiller"`.
-        **kwargs: Additional keyword arguments passed to the parent `Model` class.
+        **kwargs: Additional keyword arguments passed to the parent `Model`
+            class.
 
     Example:
 
     ```python
     # Load pre-trained teacher model from KerasHub
     import keras_hub as hub
-    
+
     teacher = hub.models.CausalLM.from_preset("gemma3_4b_en")
     student = hub.models.CausalLM.from_preset("gemma2_2b_en")
-    
+
     # Create distillation strategy
     strategy = LogitsDistillation(temperature=3.0)
 
@@ -105,8 +112,8 @@ class Distiller(Model):
     # Create multi-output strategy
     multi_strategy = MultiOutputDistillation(
         output_strategies={
-            0: LogitsDistillation(temperature=3.0, output_index=0),  # Classification
-            1: LogitsDistillation(temperature=2.0, output_index=1)   # Regression
+            0: LogitsDistillation(temperature=3.0, output_index=0),
+            1: LogitsDistillation(temperature=2.0, output_index=1)
         },
         weights={0: 1.0, 1: 0.5}  # Weight classification more heavily
     )
@@ -147,13 +154,16 @@ def __init__(
         self.teacher = teacher
         self.student = student
         self.student_loss_weight = student_loss_weight
-        
+
         # Convert string loss to function if needed
         if isinstance(student_loss, str):
             self._student_loss = keras.losses.get(student_loss)
         elif isinstance(student_loss, list):
             # Handle multi-output loss functions
-            self._student_loss = [keras.losses.get(loss) if isinstance(loss, str) else loss for loss in student_loss]
+            self._student_loss = [
+                keras.losses.get(loss) if isinstance(loss, str) else loss
+                for loss in student_loss
+            ]
         else:
             self._student_loss = student_loss
 
@@ -232,8 +242,8 @@ def compute_loss(
         """Compute combined distillation loss.
 
         This method integrates distillation into Keras's standard training
-        workflow. Users can override this method to implement custom distillation
-        loss computation.
+        workflow. Users can override this method to implement custom
+        distillation loss computation.
 
         Args:
             x: Input data.
@@ -249,26 +259,34 @@ def compute_loss(
             ```python
             # Custom distillation loss by overriding compute_loss
             class CustomDistiller(Distiller):
-                def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None, training=None):
+                def compute_loss(self, x=None, y=None, y_pred=None,
+                               sample_weight=None, training=None):
                     # Custom student loss computation
-                    student_loss = keras.losses.sparse_categorical_crossentropy(y, y_pred)
-                    
+                    student_loss = keras.losses.sparse_categorical_crossentropy(
+                        y, y_pred
+                    )
+
                     # Custom distillation loss computation
                     teacher_outputs = self.teacher(x, training=False)
                     student_outputs = self.student(x, training=training)
-                    
+
                     # Custom loss logic here
-                    distillation_loss = self._custom_distillation_loss(teacher_outputs, student_outputs)
-                    
+                    distillation_loss = self._custom_distillation_loss(
+                        teacher_outputs, student_outputs
+                    )
+
                     # Combine losses with custom weighting
                     total_loss = 0.7 * student_loss + 0.3 * distillation_loss
-                    
+
                     return total_loss
-                
-                def _custom_distillation_loss(self, teacher_outputs, student_outputs):
+
+                def _custom_distillation_loss(self, teacher_outputs,
+                                            student_outputs):
                     # Implement custom distillation loss logic
                     from keras import ops
-                    return ops.mean(ops.square(teacher_outputs - student_outputs))
+                    return ops.mean(
+                        ops.square(teacher_outputs - student_outputs)
+                    )
             ```
         """
         # Normalize y_pred and y to lists for consistent handling
@@ -281,12 +299,12 @@ def _custom_distillation_loss(self, teacher_outputs, student_outputs):
         student_loss = 0.0
         if self.student_loss_weight > 0.0 and y is not None:
             # Use the configured loss function
-            if hasattr(self, '_student_loss'):
+            if hasattr(self, "_student_loss"):
                 if isinstance(self._student_loss, list):
                     # Multi-output loss
                     if isinstance(y_pred, list) and len(y_pred) > 0:
                         student_loss = sum(
-                            loss_fn(y[i], y_pred[i]) 
+                            loss_fn(y[i], y_pred[i])
                             for i, loss_fn in enumerate(self._student_loss)
                             if i < len(y_pred)
                         )
@@ -316,13 +334,11 @@ def _custom_distillation_loss(self, teacher_outputs, student_outputs):
         if self.student_loss_weight < 1.0:
             # Get teacher outputs
             teacher_outputs = self.teacher(x, training=False)
-            
+
             for strategy in self.strategies:
-                # Validate and compute loss for this strategy
-                strategy.validate_outputs(teacher_outputs, y_pred)
-                strategy_loss = strategy.compute_loss(
-                    teacher_outputs, y_pred
-                )
+                # Compute loss for this strategy (validation happens inside
+                # strategy)
+                strategy_loss = strategy.compute_loss(teacher_outputs, y_pred)
                 distillation_loss += strategy_loss
 
         # Combine losses
@@ -338,8 +354,6 @@ def _custom_distillation_loss(self, teacher_outputs, student_outputs):
 
         return total_loss
 
-
-
     def reset_metrics(self):
         """Reset all metrics."""
         super().reset_metrics()
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 8498c138ea2b..6e0a04ec88c2 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -141,7 +141,6 @@ def test_model_compatibility_validation(self):
             )
 
     def test_student_loss_weighting(self):
-        """Test that student_loss_weight correctly weights student vs distillation loss."""
         # Test with student_loss_weight = 0.0 (only distillation loss)
         distiller_0 = Distiller(
             teacher=self.teacher,
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index e590c1cea16d..a4caf9b23f3c 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -61,11 +61,11 @@ def validate_outputs(self, teacher_outputs, student_outputs):
 
 @keras_export("keras.distillation.LogitsDistillation")
 class LogitsDistillation(BaseDistillationStrategy):
-    """Distillation strategy that transfers knowledge from final model outputs (logits).
+    """Distillation strategy that transfers knowledge from final model outputs.
 
-    This strategy applies temperature scaling to the teacher's logits before computing
-    the loss between teacher and student predictions. It's the most common approach
-    for knowledge distillation.
+    This strategy applies temperature scaling to the teacher's logits before
+    computing the loss between teacher and student predictions. It's the most
+    common approach for knowledge distillation.
 
     How Logits Distillation Works:
 
@@ -74,13 +74,14 @@ class LogitsDistillation(BaseDistillationStrategy):
        probability distributions that reveal relationships between classes.
 
     2. Loss Computation: The loss is computed between the temperature-scaled
-       teacher logits and student logits using either KL divergence or categorical
-       crossentropy.
+       teacher logits and student logits using either KL divergence or
+       categorical crossentropy.
 
     When to Use Logits Distillation:
 
     - General Classification: Works well for most classification tasks
-    - Model Compression: Effective for reducing model size while maintaining accuracy
+    - Model Compression: Effective for reducing model size while maintaining
+      accuracy
     - Transfer Learning: Good for leveraging knowledge from pre-trained models
     - Ensemble Distillation: Can combine multiple teacher models
 
@@ -88,14 +89,16 @@ class LogitsDistillation(BaseDistillationStrategy):
 
     - Low Temperature (1-2): Sharp distributions, similar to hard labels
     - Medium Temperature (3-5): Balanced softness, most commonly used
-    - High Temperature (6-10): Very soft distributions, reveals subtle relationships
+    - High Temperature (6-10): Very soft distributions, reveals subtle
+      relationships
 
     Args:
         temperature: Temperature for softmax scaling. Higher values produce
-            softer probability distributions that are easier for the student to learn.
-            Typical values range from 3-5. Defaults to 3.0.
+            softer probability distributions that are easier for the student to
+            learn. Typical values range from 3-5. Defaults to 3.0.
         loss_type: Type of loss function to use. Options:
-            - `"kl_divergence"`: KL divergence between teacher and student distributions
+            - `"kl_divergence"`: KL divergence between teacher and student
+              distributions
             - `"categorical_crossentropy"`: Crossentropy with teacher as target
         output_index: Index of the output to use for multi-output models.
             Defaults to 0.
@@ -119,16 +122,18 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
             # Get the outputs to distill
             teacher_logits = teacher_outputs[self.output_index]
             student_logits = student_outputs[self.output_index]
-            
+
             # Apply temperature scaling
             teacher_logits = teacher_logits / self.temperature
             student_logits = student_logits / self.temperature
-            
+
             # Custom loss computation
             teacher_probs = ops.softmax(teacher_logits, axis=-1)
             student_probs = ops.softmax(student_logits, axis=-1)
-            return ops.mean(keras.losses.kl_divergence(teacher_probs, student_probs))
-    
+            return ops.mean(
+                keras.losses.kl_divergence(teacher_probs, student_probs)
+            )
+
     strategy = CustomLogitsDistillation(temperature=3.0)
 
     # For multi-output models
@@ -153,8 +158,8 @@ def __init__(
         # Validate loss_type
         if loss_type not in ["kl_divergence", "categorical_crossentropy"]:
             raise ValueError(
-                f"loss_type must be one of ['kl_divergence', 'categorical_crossentropy'], "
-                f"got {loss_type}"
+                f"loss_type must be one of ['kl_divergence', "
+                f"'categorical_crossentropy'], got {loss_type}"
             )
 
     def validate_outputs(self, teacher_outputs, student_outputs):
@@ -276,8 +281,9 @@ class FeatureDistillation(BaseDistillationStrategy):
        compatible architectures or similar semantic meaning.
 
     2. Feature Extraction: Extract activations from the specified layers
-       during forward pass. The teacher features are computed with `training=False`
-       (frozen), while student features are computed with `training=True`.
+       during forward pass. The teacher features are computed with
+       `training=False` (frozen), while student features are computed with
+       `training=True`.
 
     3. Loss Computation: Compute loss between teacher and student features
        using either MSE (for identical shapes) or cosine similarity (for
@@ -331,10 +337,10 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
             # Use first output by default
             teacher_features = teacher_outputs[0]
             student_features = student_outputs[0]
-            
+
             # Custom L1 loss for feature distillation
             return ops.mean(ops.abs(teacher_features - student_features))
-    
+
     strategy = CustomFeatureDistillation(
         teacher_layer_name="dense_1",
         student_layer_name="dense_1"
@@ -630,71 +636,84 @@ class MultiOutputDistillation(BaseDistillationStrategy):
        pair (teacher output i → student output i).
 
     3. Loss Combination: Combine the losses from all outputs using
-       configurable weights. This allows prioritizing certain outputs over others.
+       configurable weights. This allows prioritizing certain outputs over
+       others.
 
     When to Use Multi-Output Distillation:
 
-    - Object Detection: Models with classification and bounding box regression
-    - Multi-Task Learning: Models that predict multiple related tasks
-    - Multiple Prediction Heads: Models with multiple outputs
-    - Different Output Types: When outputs have different characteristics
-      (e.g., categorical vs continuous)
+    - Multi-Task Models: Models with multiple outputs (classification +
+      regression)
+    - Object Detection: Models with classification and bounding box outputs
+    - Segmentation: Models with classification and mask outputs
+    - Custom Architectures: Any model with multiple distinct outputs
 
     Output Strategy Selection:
 
-    - Classification Outputs: Use `LogitsDistillation` with appropriate temperature
+    - Classification Outputs: Use `LogitsDistillation` with appropriate
+      temperature
     - Regression Outputs: Use `LogitsDistillation` with lower temperature or
       `FeatureDistillation` with MSE loss
-    - Feature Outputs: Use `FeatureDistillation` to transfer intermediate representations
+    - Feature Outputs: Use `FeatureDistillation` to transfer intermediate
+      representations
     - Mixed Types: Combine different strategies for different outputs
-    - Custom Losses: Each strategy can be subclassed to override `compute_loss` method
+    - Custom Losses: Each strategy can be subclassed to override
+      `compute_loss` method
 
     Weight Configuration:
 
-    - Equal Weights: All outputs contribute equally to the total loss
+    - Equal Weights: Default behavior, all outputs weighted equally
     - Task-Specific Weights: Weight outputs based on task importance
     - Loss-Scale Weights: Adjust weights to balance different loss scales
-    - Performance-Based: Weight outputs based on their impact on final performance
+    - Performance-Based: Weight outputs based on their impact on final
+      performance
 
     Args:
         output_strategies: Dict mapping output indices to distillation
-            strategies. Each strategy will be applied to the corresponding output.
-            Example: `{0: LogitsDistillation(), 1: FeatureDistillation()}`
+            strategies. Each strategy will be applied to the corresponding
+            output. Example: `{0: LogitsDistillation(), 1:
+            FeatureDistillation()}`
         weights: Dict mapping output indices to weights for combining losses.
-            If None, all outputs are weighted equally. Defaults to None.
-            Example: `{0: 1.0, 1: 0.5}`  # First output twice as important
+            Defaults to equal weights for all outputs. Example:
+            `{0: 1.0, 1: 0.5}`
 
-    Examples:
+    Example:
 
     ```python
-    # Object detection distillation (classification + regression)
+    # Multi-output distillation for object detection
     strategy = MultiOutputDistillation(
         output_strategies={
-            0: LogitsDistillation(temperature=3.0, output_index=0),  # Classification
-            1: LogitsDistillation(temperature=1.0, output_index=1)   # Regression
+            0: LogitsDistillation(temperature=3.0, output_index=0),
+            1: LogitsDistillation(temperature=1.0, output_index=1)
         },
         weights={0: 1.0, 1: 0.5}  # Weight classification more heavily
     )
 
-    # Multi-task learning with custom strategies
-    class CustomLogitsDistillation(LogitsDistillation):
+    # Custom multi-output strategy
+    class CustomMultiOutputDistillation(MultiOutputDistillation):
         def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
             from keras import ops
-            teacher_logits = teacher_outputs[self.output_index]
-            student_logits = student_outputs[self.output_index]
-            teacher_logits = teacher_logits / self.temperature
-            student_logits = student_logits / self.temperature
+            # Get the outputs to distill
+            teacher_logits = teacher_outputs[0]
+            student_logits = student_outputs[0]
+
+            # Apply temperature scaling
+            teacher_logits = teacher_logits / 3.0
+            student_logits = student_logits / 3.0
+
+            # Custom loss computation
             teacher_probs = ops.softmax(teacher_logits, axis=-1)
             student_probs = ops.softmax(student_logits, axis=-1)
-            return ops.mean(keras.losses.kl_divergence(teacher_probs, student_probs))
-    
+            return ops.mean(
+                keras.losses.kl_divergence(teacher_probs, student_probs)
+            )
+
     class CustomFeatureDistillation(FeatureDistillation):
         def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
             from keras import ops
             teacher_features = teacher_outputs[0]
             student_features = student_outputs[0]
             return ops.mean(ops.abs(teacher_features - student_features))
-    
+
     strategy = MultiOutputDistillation(
         output_strategies={
             0: CustomLogitsDistillation(temperature=4.0, output_index=0),
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index 406a82c45e88..4408e85f6dc5 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -249,7 +249,9 @@ def test_output_validation(self):
     def test_get_config(self):
         """Test get_config method."""
         strategy = LogitsDistillation(
-            temperature=3.0, loss_type="categorical_crossentropy", output_index=1
+            temperature=3.0,
+            loss_type="categorical_crossentropy",
+            output_index=1,
         )
         config = strategy.get_config()
         expected_config = {
@@ -262,15 +264,23 @@ def test_get_config(self):
     def test_serialization(self):
         """Test strategy serialization and deserialization."""
         original_strategy = LogitsDistillation(
-            temperature=4.0, loss_type="categorical_crossentropy", output_index=1
+            temperature=4.0,
+            loss_type="categorical_crossentropy",
+            output_index=1,
         )
         config = original_strategy.get_config()
         reconstructed_strategy = LogitsDistillation.from_config(config)
-        
-        self.assertEqual(original_strategy.temperature, reconstructed_strategy.temperature)
-        self.assertEqual(original_strategy.loss_type, reconstructed_strategy.loss_type)
-        self.assertEqual(original_strategy.output_index, reconstructed_strategy.output_index)
-        
+
+        self.assertEqual(
+            original_strategy.temperature, reconstructed_strategy.temperature
+        )
+        self.assertEqual(
+            original_strategy.loss_type, reconstructed_strategy.loss_type
+        )
+        self.assertEqual(
+            original_strategy.output_index, reconstructed_strategy.output_index
+        )
+
         # Test config matches expected
         expected_config = {
             "temperature": 4.0,
@@ -557,11 +567,19 @@ def test_serialization(self):
         )
         config = original_strategy.get_config()
         reconstructed_strategy = FeatureDistillation.from_config(config)
-        
-        self.assertEqual(original_strategy.loss_type, reconstructed_strategy.loss_type)
-        self.assertEqual(original_strategy.teacher_layer_name, reconstructed_strategy.teacher_layer_name)
-        self.assertEqual(original_strategy.student_layer_name, reconstructed_strategy.student_layer_name)
-        
+
+        self.assertEqual(
+            original_strategy.loss_type, reconstructed_strategy.loss_type
+        )
+        self.assertEqual(
+            original_strategy.teacher_layer_name,
+            reconstructed_strategy.teacher_layer_name,
+        )
+        self.assertEqual(
+            original_strategy.student_layer_name,
+            reconstructed_strategy.student_layer_name,
+        )
+
         # Test config matches expected
         expected_config = {
             "loss_type": "cosine",
@@ -750,12 +768,10 @@ def test_end_to_end_with_multi_output_models(self):
             ],
             metrics=[
                 ["accuracy"],  # Metrics for output 0
-                ["accuracy"]   # Metrics for output 1
-            ]
+                ["accuracy"],  # Metrics for output 1
+            ],
         )
 
-
-
         # Create test data for multi-output model
         x = np.random.random((20, 5)).astype(np.float32)
         # Multi-output targets: [output1_targets, output2_targets]

From 4d6610aad2ebb674115e7d3c328fc734356fc85f Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 18 Aug 2025 15:06:41 -0700
Subject: [PATCH 14/31] update distiller and strategies

---
 keras/src/distillation/distiller.py       | 360 ++++++++++--
 keras/src/distillation/distiller_test.py  |  42 +-
 keras/src/distillation/strategies.py      | 112 ++--
 keras/src/distillation/strategies_test.py | 643 ++--------------------
 4 files changed, 467 insertions(+), 690 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 584cef4ba390..27063807c492 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -55,15 +55,18 @@ class Distiller(Model):
       object detection with classification and regression heads). Allows
       different distillation strategies for different outputs.
 
+    - Custom Strategies: Create custom strategies by subclassing
+      `BaseDistillationStrategy` and overriding the `compute_loss` method.
+
     Args:
         teacher: A trained `keras.Model` that serves as the knowledge source.
             The teacher model is frozen during distillation.
         student: A `keras.Model` to be trained through distillation. This model
             will learn from both ground truth labels and the teacher's
             predictions.
-        strategy: Distillation strategy or list of strategies. Can be a single
-            strategy (e.g., `LogitsDistillation`) or a list of strategies for
-            multi-strategy distillation.
+        strategy: Distillation strategy to apply. Can be `LogitsDistillation`,
+            `FeatureDistillation`, `MultiOutputDistillation`, or a custom
+            strategy.
         student_loss_weight: Weight for the student's supervised loss component.
             Must be between 0 and 1. Higher values emphasize ground truth
             labels, lower values emphasize teacher predictions. Defaults to 0.5.
@@ -153,25 +156,73 @@ def __init__(
         # Store configuration
         self.teacher = teacher
         self.student = student
+
+        # Validate student_loss_weight
+        if not isinstance(student_loss_weight, (int, float)):
+            raise ValueError(
+                f"student_loss_weight must be a number, got "
+                f"{type(student_loss_weight)}"
+            )
+        if student_loss_weight < 0.0 or student_loss_weight > 1.0:
+            raise ValueError(
+                f"student_loss_weight must be between 0.0 and 1.0, "
+                f"got {student_loss_weight}"
+            )
         self.student_loss_weight = student_loss_weight
 
+        # Validate metrics parameter
+        if metrics is not None and not isinstance(metrics, (list, tuple)):
+            raise ValueError(
+                f"metrics must be a list or tuple, got {type(metrics)}"
+            )
+
         # Convert string loss to function if needed
         if isinstance(student_loss, str):
             self._student_loss = keras.losses.get(student_loss)
+            if self._student_loss is None:
+                raise ValueError(
+                    f"Unknown loss function: '{student_loss}'. "
+                    "Please provide a valid loss function name or instance."
+                )
         elif isinstance(student_loss, list):
             # Handle multi-output loss functions
-            self._student_loss = [
-                keras.losses.get(loss) if isinstance(loss, str) else loss
-                for loss in student_loss
-            ]
+            self._student_loss = []
+            for i, loss in enumerate(student_loss):
+                if isinstance(loss, str):
+                    loss_fn = keras.losses.get(loss)
+                    if loss_fn is None:
+                        raise ValueError(
+                            f"Unknown loss function at index {i}: '{loss}'. "
+                            "Please provide valid loss function names or "
+                            "instances."
+                        )
+                    self._student_loss.append(loss_fn)
+                else:
+                    self._student_loss.append(loss)
         else:
             self._student_loss = student_loss
 
-        # Handle strategy input - can be single strategy or list
-        if isinstance(strategy, list):
-            self.strategies = strategy
-        else:
-            self.strategies = [strategy]
+        # Validate that we have a valid loss function
+        if self._student_loss is None:
+            raise ValueError(
+                "Student loss function cannot be None. "
+                "Please provide a valid 'student_loss' parameter."
+            )
+
+        # Validate architecture compatibility for feature distillation
+        self._validate_architecture_compatibility(teacher, student)
+
+        # Store strategy (single strategy only)
+        if strategy is None:
+            raise ValueError(
+                "Distillation strategy cannot be None. "
+                "Please provide a valid strategy such as LogitsDistillation, "
+                "FeatureDistillation, or MultiOutputDistillation."
+            )
+        self.strategy = strategy
+
+        # Validate strategy-specific compatibility
+        self._validate_strategy_compatibility(teacher, student)
 
         # Freeze teacher model
         self.teacher.trainable = False
@@ -184,12 +235,20 @@ def __init__(
         self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
 
         # Compile the model with provided parameters
-        self.compile(
-            optimizer=optimizer, loss=student_loss, metrics=metrics or []
-        )
+        self.compile(optimizer=optimizer, loss=student_loss, metrics=metrics)
 
     def _validate_models(self, teacher, student):
-        """Validate that teacher and student are Keras models."""
+        """Validate that teacher and student models are compatible for
+        distillation.
+
+        This method performs comprehensive validation including:
+        - Model type validation
+        - Input shape compatibility
+        - Output shape compatibility
+        - Architecture compatibility for feature distillation
+        - Data type compatibility
+        """
+        # Basic model type validation
         if not isinstance(teacher, keras.Model):
             raise ValueError(
                 f"Teacher must be a keras.Model, got {type(teacher)}"
@@ -199,6 +258,178 @@ def _validate_models(self, teacher, student):
                 f"Student must be a keras.Model, got {type(student)}"
             )
 
+        # Check if models are built
+        # Subclassed models may not be built at this point and may not expose
+        # symbolic `inputs`/`outputs`. We avoid hard failures here and rely on
+        # runtime checks during the first call/fit. When symbolic tensors are
+        # available, we perform full compatibility validation below.
+
+        # Validate input compatibility
+        self._validate_input_compatibility(teacher, student)
+
+        # Validate output compatibility
+        self._validate_output_compatibility(teacher, student)
+
+        # Validate data type compatibility
+        self._validate_dtype_compatibility(teacher, student)
+
+    def _validate_input_compatibility(self, teacher, student):
+        """Validate that teacher and student have compatible input shapes."""
+        # If symbolic tensors are not available (subclassed models), skip.
+        if not hasattr(teacher, "inputs") or not hasattr(student, "inputs"):
+            return
+        teacher_inputs = getattr(teacher, "inputs")
+        student_inputs = getattr(student, "inputs")
+        if teacher_inputs is None or student_inputs is None:
+            return
+
+        # Handle single input case
+        if not isinstance(teacher_inputs, (list, tuple)):
+            teacher_inputs = [teacher_inputs]
+        if not isinstance(student_inputs, (list, tuple)):
+            student_inputs = [student_inputs]
+
+        # Check number of inputs
+        if len(teacher_inputs) != len(student_inputs):
+            raise ValueError(
+                f"Teacher and student must have the same number of inputs. "
+                f"Teacher has {len(teacher_inputs)} inputs, "
+                f"student has {len(student_inputs)} inputs."
+            )
+
+        # Check input shapes
+        for i, (teacher_input, student_input) in enumerate(
+            zip(teacher_inputs, student_inputs)
+        ):
+            teacher_shape = teacher_input.shape
+            student_shape = student_input.shape
+
+            # Check if shapes are compatible (allowing for batch dimension
+            # flexibility)
+            if not self._shapes_are_compatible(teacher_shape, student_shape):
+                raise ValueError(
+                    f"Input {i} shapes are incompatible. "
+                    f"Teacher input shape: {teacher_shape}, "
+                    f"Student input shape: {student_shape}. "
+                    f"All dimensions except batch size must match."
+                )
+
+    def _validate_output_compatibility(self, teacher, student):
+        """Validate that teacher and student have compatible output shapes."""
+        # If symbolic tensors are not available (subclassed models), skip.
+        if not hasattr(teacher, "outputs") or not hasattr(student, "outputs"):
+            return
+        teacher_outputs = getattr(teacher, "outputs")
+        student_outputs = getattr(student, "outputs")
+        if teacher_outputs is None or student_outputs is None:
+            return
+
+        # Handle single output case
+        if not isinstance(teacher_outputs, (list, tuple)):
+            teacher_outputs = [teacher_outputs]
+        if not isinstance(student_outputs, (list, tuple)):
+            student_outputs = [student_outputs]
+
+        # Check number of outputs
+        if len(teacher_outputs) != len(student_outputs):
+            raise ValueError(
+                f"Teacher and student must have the same number of outputs. "
+                f"Teacher has {len(teacher_outputs)} outputs, "
+                f"student has {len(student_outputs)} outputs."
+            )
+
+        # Check output shapes
+        for i, (teacher_output, student_output) in enumerate(
+            zip(teacher_outputs, student_outputs)
+        ):
+            teacher_shape = teacher_output.shape
+            student_shape = student_output.shape
+
+            # For distillation, output shapes should be compatible
+            if not self._shapes_are_compatible(teacher_shape, student_shape):
+                raise ValueError(
+                    f"Output {i} shapes are incompatible. "
+                    f"Teacher output shape: {teacher_shape}, "
+                    f"Student output shape: {student_shape}. "
+                    f"All dimensions except batch size must match."
+                )
+
+    def _validate_dtype_compatibility(self, teacher, student):
+        """Validate that teacher and student have compatible data types."""
+        # If symbolic tensors are not available (subclassed models), skip.
+        if not hasattr(teacher, "inputs") or not hasattr(student, "inputs"):
+            return
+        if teacher.inputs is None or student.inputs is None:
+            return
+        teacher_dtypes = [input.dtype for input in teacher.inputs]
+        student_dtypes = [input.dtype for input in student.inputs]
+
+        # Check input dtypes
+        for i, (teacher_dtype, student_dtype) in enumerate(
+            zip(teacher_dtypes, student_dtypes)
+        ):
+            if teacher_dtype != student_dtype:
+                raise ValueError(
+                    f"Input {i} data types are incompatible. "
+                    f"Teacher dtype: {teacher_dtype}, "
+                    f"Student dtype: {student_dtype}."
+                )
+
+        # Check output dtypes
+        teacher_output_dtypes = [output.dtype for output in teacher.outputs]
+        student_output_dtypes = [output.dtype for output in student.outputs]
+
+        for i, (teacher_dtype, student_dtype) in enumerate(
+            zip(teacher_output_dtypes, student_output_dtypes)
+        ):
+            if teacher_dtype != student_dtype:
+                raise ValueError(
+                    f"Output {i} data types are incompatible. "
+                    f"Teacher output dtype: {teacher_dtype}, "
+                    f"Student output dtype: {student_dtype}. "
+                    f"Both models must use the same data type."
+                )
+
+    def _validate_architecture_compatibility(self, teacher, student):
+        """Validate architecture compatibility for feature distillation."""
+        # This validation is strategy-specific and will be called by strategies
+        # that require specific architectural compatibility
+        pass
+
+    def _validate_strategy_compatibility(self, teacher, student):
+        """Validate that the strategy is compatible with the teacher and student
+        models."""
+        if hasattr(self.strategy, "validate_model_compatibility"):
+            self.strategy.validate_model_compatibility(teacher, student)
+
+    def _shapes_are_compatible(self, shape1, shape2):
+        """Check if two shapes are compatible (allowing for batch dimension
+        flexibility)."""
+        # Convert to lists for easier handling
+        if hasattr(shape1, "as_list"):
+            shape1 = shape1.as_list()
+        elif hasattr(shape1, "__iter__"):
+            shape1 = list(shape1)
+        else:
+            shape1 = [shape1]
+
+        if hasattr(shape2, "as_list"):
+            shape2 = shape2.as_list()
+        elif hasattr(shape2, "__iter__"):
+            shape2 = list(shape2)
+        else:
+            shape2 = [shape2]
+
+        # Check if they have the same number of dimensions
+        if len(shape1) != len(shape2):
+            return False
+
+        # Check all dimensions except the first (batch dimension)
+        for dim1, dim2 in zip(shape1[1:], shape2[1:]):
+            if dim1 is not None and dim2 is not None and dim1 != dim2:
+                return False
+        return True
+
     def get_student_model(self):
         """Get the trained student model for independent use.
 
@@ -295,39 +526,78 @@ def _custom_distillation_loss(self, teacher_outputs,
         if y is not None and not isinstance(y, (list, tuple)):
             y = [y]
 
-        # Compute student loss using basic loss function
+        # Compute student loss
         student_loss = 0.0
         if self.student_loss_weight > 0.0 and y is not None:
             # Use the configured loss function
-            if hasattr(self, "_student_loss"):
+            if (
+                hasattr(self, "_student_loss")
+                and self._student_loss is not None
+            ):
                 if isinstance(self._student_loss, list):
                     # Multi-output loss
                     if isinstance(y_pred, list) and len(y_pred) > 0:
+                        # Validate lengths match
+                        if len(y) != len(y_pred):
+                            raise ValueError(
+                                f"Number of targets ({len(y)}) must match "
+                                f"number of predictions ({len(y_pred)}) for "
+                                f"multi-output loss computation."
+                            )
+                        if len(self._student_loss) != len(y):
+                            raise ValueError(
+                                f"Number of loss functions "
+                                f"({len(self._student_loss)}) must match "
+                                f"number of outputs ({len(y)}) for "
+                                f"multi-output loss computation."
+                            )
+
+                        # Compute loss for each output
                         student_loss = sum(
                             loss_fn(y[i], y_pred[i])
                             for i, loss_fn in enumerate(self._student_loss)
-                            if i < len(y_pred)
                         )
                     else:
                         # Single output with multi-output loss list
+                        if len(self._student_loss) != 1:
+                            raise ValueError(
+                                f"Single output provided but "
+                                f"{len(self._student_loss)} loss functions "
+                                f"configured. Use a single loss function or "
+                                f"provide multiple outputs."
+                            )
                         student_loss = self._student_loss[0](y[0], y_pred[0])
                 else:
                     # Single loss function
                     if isinstance(y_pred, list) and len(y_pred) > 0:
-                        # For multi-output, use first output for student loss
+                        # Multi-output with single loss function
+                        if len(y) != len(y_pred):
+                            raise ValueError(
+                                f"Number of targets ({len(y)}) must match "
+                                f"number of predictions ({len(y_pred)}) for "
+                                f"multi-output loss computation."
+                            )
+                        # Use first output for student loss (consistent
+                        # behavior)
                         student_loss = self._student_loss(y[0], y_pred[0])
                     else:
+                        # Single output with single loss function
                         student_loss = self._student_loss(y, y_pred)
             else:
-                # Fallback to default
-                if isinstance(y_pred, list) and len(y_pred) > 0:
-                    student_loss = keras.losses.sparse_categorical_crossentropy(
-                        y[0], y_pred[0]
-                    )
-                else:
-                    student_loss = keras.losses.sparse_categorical_crossentropy(
-                        y, y_pred
-                    )
+                # No loss function configured - this is an error
+                raise ValueError(
+                    "Student loss function is not configured. "
+                    "Please provide a valid 'student_loss' parameter to the "
+                    "Distiller constructor. "
+                    "Examples: 'sparse_categorical_crossentropy', "
+                    "'categorical_crossentropy', or a custom loss function."
+                )
+
+            # Ensure student_loss is a scalar
+            from keras import ops
+
+            if hasattr(student_loss, "shape") and len(student_loss.shape) > 0:
+                student_loss = ops.mean(student_loss)
 
         # Compute distillation loss
         distillation_loss = 0.0
@@ -335,11 +605,19 @@ def _custom_distillation_loss(self, teacher_outputs,
             # Get teacher outputs
             teacher_outputs = self.teacher(x, training=False)
 
-            for strategy in self.strategies:
-                # Compute loss for this strategy (validation happens inside
-                # strategy)
-                strategy_loss = strategy.compute_loss(teacher_outputs, y_pred)
-                distillation_loss += strategy_loss
+            # Apply the single strategy
+            distillation_loss = self.strategy.compute_loss(
+                teacher_outputs, y_pred
+            )
+
+            # Ensure distillation_loss is a scalar
+            from keras import ops
+
+            if (
+                hasattr(distillation_loss, "shape")
+                and len(distillation_loss.shape) > 0
+            ):
+                distillation_loss = ops.mean(distillation_loss)
 
         # Combine losses
         total_loss = (
@@ -383,10 +661,9 @@ def get_config(self):
                 "student": serialization_lib.serialize_keras_object(
                     self.student
                 ),
-                "strategy": [
-                    serialization_lib.serialize_keras_object(s)
-                    for s in self.strategies
-                ],
+                "strategy": serialization_lib.serialize_keras_object(
+                    self.strategy
+                ),
                 "student_loss_weight": self.student_loss_weight,
                 "input_mapping": self.input_mapping,
                 "output_mapping": self.output_mapping,
@@ -405,8 +682,7 @@ def from_config(cls, config):
         config["student"] = serialization_lib.deserialize_keras_object(
             config["student"]
         )
-        config["strategy"] = [
-            serialization_lib.deserialize_keras_object(s)
-            for s in config["strategy"]
-        ]
+        config["strategy"] = serialization_lib.deserialize_keras_object(
+            config["strategy"]
+        )
         return cls(**config)
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 6e0a04ec88c2..6a80026532f6 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -74,16 +74,21 @@ def test_distiller_initialization(self):
         # Check student_loss_weight
         self.assertEqual(self.distiller.student_loss_weight, 0.5)
 
-        # Check strategies
-        self.assertLen(self.distiller.strategies, 1)
-        self.assertIsInstance(self.distiller.strategies[0], LogitsDistillation)
+        # Check strategy
+        self.assertIsInstance(self.distiller.strategy, LogitsDistillation)
 
         # Check that strategy has the correct temperature
-        self.assertEqual(self.distiller.strategies[0].temperature, 2.0)
+        self.assertEqual(self.distiller.strategy.temperature, 2.0)
 
         # Check that model is compiled
         self.assertIsNotNone(self.distiller.optimizer)
-        self.assertIsNotNone(self.distiller.compiled_loss)
+        # Check if the model has been compiled (different backends may handle
+        # this differently)
+        self.assertTrue(
+            hasattr(self.distiller, "_compile_config")
+            or hasattr(self.distiller, "compiled_loss"),
+            "Model should be compiled",
+        )
 
     def test_distiller_call(self):
         """Test Distiller call method (inference)."""
@@ -358,23 +363,17 @@ def test_distiller_serialization_and_saving(self):
             ]
         )
 
-        # Create distiller with multiple strategies
-        from keras.src.distillation.strategies import FeatureDistillation
+        # Create distiller with single strategy
         from keras.src.distillation.strategies import LogitsDistillation
 
-        strategies = [
-            LogitsDistillation(temperature=3.0, loss_type="kl_divergence"),
-            FeatureDistillation(
-                loss_type="mse",
-                teacher_layer_name="teacher_dense_1",
-                student_layer_name="student_dense_1",
-            ),
-        ]
+        strategy = LogitsDistillation(
+            temperature=3.0, loss_type="kl_divergence"
+        )
 
         original_distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategy=strategies,
+            strategy=strategy,
             student_loss_weight=0.7,
             optimizer=keras.optimizers.Adam(),
             student_loss="sparse_categorical_crossentropy",
@@ -408,19 +407,12 @@ def test_distiller_serialization_and_saving(self):
 
         # Verify reconstruction
         self.assertEqual(reconstructed_distiller.student_loss_weight, 0.7)
-        self.assertEqual(len(reconstructed_distiller.strategies), 2)
-
-        # Verify strategy types
-        self.assertIsInstance(
-            reconstructed_distiller.strategies[0], LogitsDistillation
-        )
         self.assertIsInstance(
-            reconstructed_distiller.strategies[1], FeatureDistillation
+            reconstructed_distiller.strategy, LogitsDistillation
         )
 
         # Verify strategy parameters
-        self.assertEqual(reconstructed_distiller.strategies[0].temperature, 3.0)
-        self.assertEqual(reconstructed_distiller.strategies[1].loss_type, "mse")
+        self.assertEqual(reconstructed_distiller.strategy.temperature, 3.0)
 
         # Test that reconstructed distiller can be used for inference
         reconstructed_output = reconstructed_distiller(x_test)
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index a4caf9b23f3c..9318ec8f74bd 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -53,7 +53,8 @@ def validate_outputs(self, teacher_outputs, student_outputs):
 
         if len(teacher_outputs) != len(student_outputs):
             raise ValueError(
-                f"Teacher and student must have the same number of outputs. "
+                f"Teacher and student must have the same number of "
+                f"outputs. "
                 f"Teacher has {len(teacher_outputs)} outputs, "
                 f"student has {len(student_outputs)} outputs."
             )
@@ -162,6 +163,16 @@ def __init__(
                 f"'categorical_crossentropy'], got {loss_type}"
             )
 
+        # Validate temperature
+        if not isinstance(self.temperature, (int, float)):
+            raise ValueError(
+                f"temperature must be a number, got {type(self.temperature)}"
+            )
+        if self.temperature <= 0.0:
+            raise ValueError(
+                "temperature must be > 0. Set a positive value (e.g., 1-10)."
+            )
+ 
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for logits distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
@@ -234,23 +245,26 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
                 keras.losses.kl_divergence(teacher_probs, student_probs)
             )
 
+            # Scale by temperature^2 for KL (per literature)
+            return loss * (self.temperature**2)
+
         elif self.loss_type == "categorical_crossentropy":
-            # Convert teacher to probabilities, keep student as logits
+            # Convert teacher to probabilities, keep student as logits and
+            # pass from_logits=True for correct computation.
             teacher_probs = ops.softmax(teacher_logits, axis=-1)
 
-            # Use Keras CategoricalCrossentropy directly and reduce to scalar
             loss = ops.mean(
                 keras.losses.categorical_crossentropy(
-                    teacher_probs, student_logits
+                    teacher_probs, student_logits, from_logits=True
                 )
             )
 
+            # Do NOT scale by temperature^2 for categorical crossentropy
+            return loss
+
         else:
             raise ValueError(f"Unknown loss_type: {self.loss_type}")
 
-        # Scale by temperature^2 for consistency with literature
-        return loss * (self.temperature**2)
-
     def get_config(self):
         """Get configuration for serialization."""
         return {
@@ -470,7 +484,8 @@ def _create_feature_extractor(self, model, layer_name):
                 f"Layer '{layer_name}' not found in model. "
                 f"This may happen with a subclassed model that cannot be "
                 f"traversed using the standard layer API. "
-                f"Available layers: {[layer.name for layer in model.layers]}"
+                f"Available layers: "
+                f"{[layer.name for layer in model.layers]}"
             )
 
         # Create a new model that extracts features from the specified layer.
@@ -509,47 +524,70 @@ def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for feature distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
 
-        # For feature distillation, we need to ensure the features have
-        # compatible shapes for the chosen loss function
+        # Normalize outputs to lists
         if not isinstance(teacher_outputs, (list, tuple)):
             teacher_outputs = [teacher_outputs]
         if not isinstance(student_outputs, (list, tuple)):
             student_outputs = [student_outputs]
 
-        # Basic shape compatibility check
-        teacher_features = teacher_outputs[0]  # Use first output by default
-        student_features = student_outputs[0]  # Use first output by default
-
-        if len(teacher_features.shape) != len(student_features.shape):
-            raise ValueError(
-                f"Teacher and student features must have the same number of "
-                f"dimensions. "
-                f"Teacher shape: {teacher_features.shape}, "
-                f"Student shape: {student_features.shape}"
-            )
+        # For feature distillation, we need to validate layer compatibility
+        if (
+            self.teacher_layer_name is not None
+            and self.student_layer_name is not None
+        ):
+            # Validate that the specified layers exist and are compatible
+            self._validate_layer_compatibility(teacher_outputs, student_outputs)
+        else:
+            # If no specific layers are specified, validate final outputs
+            if len(teacher_outputs) != len(student_outputs):
+                raise ValueError(
+                    f"Teacher and student must have the same number of "
+                    f"outputs. "
+                    f"Teacher has {len(teacher_outputs)} outputs, "
+                    f"student has {len(student_outputs)} outputs."
+                )
 
-        # For MSE loss, shapes must match exactly
-        if self.loss_type == "mse":
-            if teacher_features.shape != student_features.shape:
+    def _validate_layer_compatibility(self, teacher_outputs, student_outputs):
+        """Validate that the specified layers are compatible for feature
+        distillation."""
+        # This method would be called by the distiller to validate layer
+        # compatibility when using feature distillation with specific layer
+        # names
+        pass
+
+    def validate_model_compatibility(self, teacher, student):
+        """Validate that teacher and student models are compatible for feature
+        distillation."""
+        # Check if specified layers exist in the models
+        if self.teacher_layer_name is not None:
+            if not self._layer_exists_in_model(
+                teacher, self.teacher_layer_name
+            ):
                 raise ValueError(
-                    f"For MSE loss, teacher and student features must have "
-                    f"identical shapes. Got teacher: {teacher_features.shape}, "
-                    f"student: {student_features.shape}. "
-                    f"Consider using 'cosine' loss type for different sizes "
-                    f"or add alignment layers to make features compatible."
+                    f"Teacher layer '{self.teacher_layer_name}' not found in "
+                    f"teacher model. "
+                    f"Available layers: "
+                    f"{[layer.name for layer in teacher.layers]}"
                 )
 
-        # For cosine loss, only last dimension needs to match (features)
-        elif self.loss_type == "cosine":
-            if teacher_features.shape[-1] != student_features.shape[-1]:
+        if self.student_layer_name is not None:
+            if not self._layer_exists_in_model(
+                student, self.student_layer_name
+            ):
                 raise ValueError(
-                    f"For cosine similarity loss, teacher and student features "
-                    f"must have the same feature dimension (last axis). "
-                    f"Got teacher: {teacher_features.shape[-1]}, "
-                    f"student: {student_features.shape[-1]}. "
-                    f"Consider adding a projection layer to align dimensions."
+                    f"Student layer '{self.student_layer_name}' not found in "
+                    f"student model. "
+                    f"Available layers: "
+                    f"{[layer.name for layer in student.layers]}"
                 )
 
+    def _layer_exists_in_model(self, model, layer_name):
+        """Check if a layer with the given name exists in the model."""
+        for layer in model.layers:
+            if layer.name == layer_name:
+                return True
+        return False
+
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute feature distillation loss using extracted features.
 
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index 4408e85f6dc5..6ea022ef6dfb 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -45,13 +45,10 @@ def call(self, inputs, training=None):
 class TestLogitsDistillation(TestCase):
     """Essential test cases for LogitsDistillation strategy."""
 
-    def setUp(self):
-        """Set up test fixtures."""
-        super().setUp()
-        self.strategy = LogitsDistillation(temperature=2.0)
+    def test_logits_distillation_end_to_end(self):
+        """Test logits distillation loss computation end-to-end."""
+        strategy = LogitsDistillation(temperature=2.0)
 
-    def test_logits_distillation_loss(self):
-        """Test logits distillation loss computation."""
         # Create dummy logits with sufficient difference to ensure non-zero loss
         teacher_logits = ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
@@ -61,7 +58,7 @@ def test_logits_distillation_loss(self):
         )
 
         # Compute loss
-        loss = self.strategy.compute_loss(teacher_logits, student_logits)
+        loss = strategy.compute_loss(teacher_logits, student_logits)
 
         # Check that loss is a scalar tensor
         self.assertEqual(len(loss.shape), 0)
@@ -70,236 +67,40 @@ def test_logits_distillation_loss(self):
         self.assertTrue(ops.isfinite(loss))
         self.assertGreater(loss, 0.0)
 
-    def test_temperature_scaling(self):
-        """Test temperature scaling in logits distillation."""
-        # Create dummy logits with sufficient difference
-        teacher_logits = ops.convert_to_tensor(
-            np.array([[1.0, 2.0, 3.0]]), dtype="float32"
-        )
-        student_logits = ops.convert_to_tensor(
-            np.array([[2.0, 1.0, 4.0]]), dtype="float32"
-        )
-
-        # Test with different temperatures
-        temperatures = [1.0, 2.0, 4.0]
-        losses = []
-
-        for temp in temperatures:
-            strategy = LogitsDistillation(temperature=temp)
-            loss = strategy.compute_loss(teacher_logits, student_logits)
-            losses.append(loss)
-
-        # Higher temperature should result in different loss values
-        self.assertNotEqual(losses[0], losses[1])
-        self.assertNotEqual(losses[1], losses[2])
-
-
-@pytest.mark.requires_trainable_backend
-class TestLogitsDistillationComprehensive(TestCase):
-    """Comprehensive test cases for LogitsDistillation strategy."""
-
-    def setUp(self):
-        """Set up test fixtures."""
-        super().setUp()
-        self.strategy = LogitsDistillation(temperature=2.0)
-
-    def test_initialization(self):
-        """Test LogitsDistillation initialization."""
-        # Test default initialization (no temperature specified)
-        strategy = LogitsDistillation()
-        self.assertEqual(strategy.temperature, 3.0)  # Default fallback
-        self.assertEqual(strategy.loss_type, "kl_divergence")
-        self.assertEqual(strategy.output_index, 0)
-
-        # Test custom initialization
-        strategy = LogitsDistillation(
-            temperature=5.0,
-            loss_type="categorical_crossentropy",
-            output_index=1,
-        )
-        self.assertEqual(strategy.temperature, 5.0)
-        self.assertEqual(strategy.loss_type, "categorical_crossentropy")
-        self.assertEqual(strategy.output_index, 1)
-
-    def test_invalid_loss_type(self):
-        """Test that invalid loss types raise ValueError."""
-        with self.assertRaises(ValueError):
-            LogitsDistillation(loss_type="invalid_loss")
-
-    def test_temperature_configuration(self):
-        """Test that temperature is properly configured."""
-        # Create strategy with explicit temperature
-        strategy = LogitsDistillation(temperature=4.0)
-        self.assertEqual(strategy.temperature, 4.0)
-
-        # Create strategy with default temperature
-        strategy_default = LogitsDistillation()
-        self.assertEqual(strategy_default.temperature, 3.0)
-
-    def test_logits_distillation_loss_kl_divergence(self):
-        """Test logits distillation loss computation with KL divergence."""
-        strategy = LogitsDistillation(
+    def test_logits_distillation_with_different_loss_types(self):
+        """Test logits distillation with different loss types."""
+        # Test KL divergence
+        strategy_kl = LogitsDistillation(
             temperature=2.0, loss_type="kl_divergence"
         )
-
         teacher_logits = ops.convert_to_tensor(
-            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
+            np.array([[1.0, 2.0, 3.0]]), dtype="float32"
         )
         student_logits = ops.convert_to_tensor(
-            np.array([[2.0, 1.0, 4.0], [3.0, 6.0, 2.0]]), dtype="float32"
+            np.array([[2.0, 1.0, 4.0]]), dtype="float32"
         )
 
-        # Compute loss
-        loss = strategy.compute_loss(teacher_logits, student_logits)
-
-        # Check that loss is a scalar tensor
-        self.assertEqual(len(loss.shape), 0)
-
-        # Check that loss is finite and positive
-        self.assertTrue(ops.isfinite(loss))
-        self.assertGreater(loss, 0.0)
+        loss_kl = strategy_kl.compute_loss(teacher_logits, student_logits)
+        self.assertTrue(ops.isfinite(loss_kl))
+        self.assertGreater(loss_kl, 0.0)
 
-    def test_logits_distillation_loss_categorical_crossentropy(self):
-        """Test logits distillation loss with categorical crossentropy."""
-        strategy = LogitsDistillation(
+        # Test categorical crossentropy
+        strategy_ce = LogitsDistillation(
             temperature=2.0, loss_type="categorical_crossentropy"
         )
-
-        teacher_logits = ops.convert_to_tensor(
-            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
-        )
-        student_logits = ops.convert_to_tensor(
-            np.array([[2.0, 1.0, 4.0], [3.0, 6.0, 2.0]]), dtype="float32"
-        )
-
-        # Compute loss
-        loss = strategy.compute_loss(teacher_logits, student_logits)
-
-        # Check that loss is a scalar tensor
-        self.assertEqual(len(loss.shape), 0)
-
-        # Check that loss is finite and positive
-        self.assertTrue(ops.isfinite(loss))
-        self.assertGreater(loss, 0.0)
-
-    def test_multi_output_support(self):
-        """Test support for multi-output models."""
-        # Create dummy multi-output logits
-        teacher_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[4.0, 5.0]]), dtype="float32"),
-        ]
-        student_outputs = [
-            ops.convert_to_tensor(np.array([[1.1, 2.1, 3.1]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[4.1, 5.1]]), dtype="float32"),
-        ]
-
-        # Test with output_index=0
-        strategy = LogitsDistillation(temperature=2.0, output_index=0)
-        loss = strategy.compute_loss(teacher_outputs, student_outputs)
-        self.assertTrue(ops.isfinite(loss))
-
-        # Test with output_index=1
-        strategy = LogitsDistillation(temperature=2.0, output_index=1)
-        loss = strategy.compute_loss(teacher_outputs, student_outputs)
-        self.assertTrue(ops.isfinite(loss))
-
-    def test_output_validation(self):
-        """Test output validation."""
-        strategy = LogitsDistillation(temperature=2.0, output_index=0)
-
-        # Test with compatible outputs
-        teacher_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
-        ]
-        student_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
-        ]
-
-        # Should not raise an error
-        strategy.validate_outputs(teacher_outputs, student_outputs)
-
-        # Test with incompatible output shapes
-        teacher_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
-        ]
-        student_outputs = [
-            ops.convert_to_tensor(
-                np.array([[1.0, 2.0]]), dtype="float32"
-            )  # Different number of classes
-        ]
-
-        with self.assertRaises(ValueError):
-            strategy.validate_outputs(teacher_outputs, student_outputs)
-
-        # Test with invalid output index
-        strategy = LogitsDistillation(
-            temperature=2.0, output_index=1
-        )  # Invalid index
-        teacher_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
-        ]
-        student_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
-        ]
-
-        with self.assertRaises(ValueError):
-            strategy.validate_outputs(teacher_outputs, student_outputs)
-
-    def test_get_config(self):
-        """Test get_config method."""
-        strategy = LogitsDistillation(
-            temperature=3.0,
-            loss_type="categorical_crossentropy",
-            output_index=1,
-        )
-        config = strategy.get_config()
-        expected_config = {
-            "temperature": 3.0,
-            "loss_type": "categorical_crossentropy",
-            "output_index": 1,
-        }
-        self.assertEqual(config, expected_config)
-
-    def test_serialization(self):
-        """Test strategy serialization and deserialization."""
-        original_strategy = LogitsDistillation(
-            temperature=4.0,
-            loss_type="categorical_crossentropy",
-            output_index=1,
-        )
-        config = original_strategy.get_config()
-        reconstructed_strategy = LogitsDistillation.from_config(config)
-
-        self.assertEqual(
-            original_strategy.temperature, reconstructed_strategy.temperature
-        )
-        self.assertEqual(
-            original_strategy.loss_type, reconstructed_strategy.loss_type
-        )
-        self.assertEqual(
-            original_strategy.output_index, reconstructed_strategy.output_index
-        )
-
-        # Test config matches expected
-        expected_config = {
-            "temperature": 4.0,
-            "loss_type": "categorical_crossentropy",
-            "output_index": 1,
-        }
-        self.assertEqual(config, expected_config)
+        loss_ce = strategy_ce.compute_loss(teacher_logits, student_logits)
+        self.assertTrue(ops.isfinite(loss_ce))
+        self.assertGreater(loss_ce, 0.0)
 
 
 @pytest.mark.requires_trainable_backend
 class TestFeatureDistillation(TestCase):
-    """Test cases for FeatureDistillation strategy."""
-
-    def setUp(self):
-        """Set up test fixtures."""
-        super().setUp()
+    """Essential test cases for FeatureDistillation strategy."""
 
+    def test_feature_distillation_end_to_end(self):
+        """Test feature distillation end-to-end."""
         # Create models with named layers for feature extraction
-        self.teacher = keras.Sequential(
+        teacher = keras.Sequential(
             [
                 keras.layers.Dense(
                     64, activation="relu", name="teacher_dense_1"
@@ -311,7 +112,7 @@ def setUp(self):
             ]
         )
 
-        self.student = keras.Sequential(
+        student = keras.Sequential(
             [
                 keras.layers.Dense(
                     32, activation="relu", name="student_dense_1"
@@ -323,188 +124,18 @@ def setUp(self):
             ]
         )
 
-        # Create a complex model with residual connections for testing
-        inputs = keras.layers.Input(shape=(20,), name="input")
-        x = keras.layers.Dense(64, activation="relu", name="dense_1")(inputs)
-        residual = keras.layers.Dense(64, name="residual_projection")(inputs)
-        x = keras.layers.Add(name="residual_add")([x, residual])
-        x = keras.layers.Dense(32, activation="relu", name="dense_2")(x)
-        outputs = keras.layers.Dense(10, name="output")(x)
-
-        self.complex_model = keras.Model(
-            inputs=inputs, outputs=outputs, name="complex_model"
-        )
+        # Build models
+        dummy_input = np.random.random((1, 10)).astype(np.float32)
+        _ = teacher(dummy_input)
+        _ = student(dummy_input)
 
-    def test_initialization(self):
-        """Test FeatureDistillation initialization."""
-        # Test default initialization
-        strategy = FeatureDistillation()
-        self.assertEqual(strategy.loss_type, "mse")
-        self.assertIsNone(strategy.teacher_layer_name)
-        self.assertIsNone(strategy.student_layer_name)
-        self.assertIsNone(strategy._teacher_feature_model)
-        self.assertIsNone(strategy._student_feature_model)
-
-        # Test custom initialization
-        strategy = FeatureDistillation(
-            loss_type="cosine",
-            teacher_layer_name="dense_1",
-            student_layer_name="dense_1",
-        )
-        self.assertEqual(strategy.loss_type, "cosine")
-        self.assertEqual(strategy.teacher_layer_name, "dense_1")
-        self.assertEqual(strategy.student_layer_name, "dense_1")
-
-    def test_invalid_loss_type(self):
-        """Test that invalid loss types raise ValueError."""
-        with self.assertRaises(ValueError):
-            FeatureDistillation(loss_type="invalid_loss")
-
-    def test_create_feature_extractor_with_layer_name(self):
-        """Test feature extractor creation with specific layer name."""
-        strategy = FeatureDistillation(
+        # Test MSE loss
+        strategy_mse = FeatureDistillation(
+            loss_type="mse",
             teacher_layer_name="teacher_dense_1",
             student_layer_name="student_dense_1",
         )
 
-        # Build the models first (needed for Sequential models)
-        dummy_input = np.random.random((1, 10)).astype(np.float32)
-        _ = self.teacher(dummy_input)
-        _ = self.student(dummy_input)
-
-        # Test teacher feature extractor creation
-        teacher_feature_extractor = strategy._create_feature_extractor(
-            self.teacher, "teacher_dense_1"
-        )
-        self.assertIsInstance(teacher_feature_extractor, keras.Model)
-        self.assertEqual(
-            teacher_feature_extractor.name,
-            f"{self.teacher.name}_features_teacher_dense_1",
-        )
-
-        # Test student feature extractor creation
-        student_feature_extractor = strategy._create_feature_extractor(
-            self.student, "student_dense_1"
-        )
-        self.assertIsInstance(student_feature_extractor, keras.Model)
-        self.assertEqual(
-            student_feature_extractor.name,
-            f"{self.student.name}_features_student_dense_1",
-        )
-
-    def test_create_feature_extractor_without_layer_name(self):
-        """Test feature model creation without layer name (returns original)."""
-        strategy = FeatureDistillation()
-
-        # Should return original model when no layer name specified
-        feature_model = strategy._create_feature_extractor(self.teacher, None)
-        self.assertIs(feature_model, self.teacher)
-
-    def test_create_feature_extractor_invalid_layer_name(self):
-        """Test that invalid layer names raise ValueError."""
-        strategy = FeatureDistillation()
-
-        with self.assertRaises(ValueError) as cm:
-            strategy._create_feature_extractor(
-                self.teacher, "nonexistent_layer"
-            )
-
-        self.assertIn(
-            "Layer 'nonexistent_layer' not found in model", str(cm.exception)
-        )
-        self.assertIn("Available layers:", str(cm.exception))
-
-    def test_complex_model_feature_extraction(self):
-        """Test feature extraction with complex model topologies."""
-        strategy = FeatureDistillation(
-            teacher_layer_name="dense_1", student_layer_name="dense_1"
-        )
-
-        # Test with complex model with residual connections
-        x = np.random.random((2, 20)).astype(np.float32)
-
-        # This should work with the robust implementation
-        feature_extractor = strategy._create_feature_extractor(
-            self.complex_model, "dense_1"
-        )
-        self.assertIsInstance(feature_extractor, keras.Model)
-
-        # Test that it actually extracts features correctly
-        features = feature_extractor(x)
-        self.assertEqual(features.shape, (2, 64))  # dense_1 output size
-
-        # Verify it's different from final output
-        full_output = self.complex_model(x)
-        self.assertEqual(full_output.shape, (2, 10))  # final output size
-        self.assertNotEqual(features.shape, full_output.shape)
-
-    def test_residual_connection_feature_extraction(self):
-        """Test feature extraction from residual add layer."""
-        from keras import ops
-
-        strategy = FeatureDistillation()
-
-        x = np.random.random((2, 20)).astype(np.float32)
-
-        # Extract features from the residual add layer
-        residual_extractor = strategy._create_feature_extractor(
-            self.complex_model, "residual_add"
-        )
-
-        residual_features = residual_extractor(x)
-        self.assertEqual(residual_features.shape, (2, 64))  # After residual add
-
-        # Verify it's working correctly by comparing with manual computation
-        dense_1_extractor = strategy._create_feature_extractor(
-            self.complex_model, "dense_1"
-        )
-        dense_1_features = dense_1_extractor(x)
-
-        # The residual features should be different from just dense_1
-        # (since they include the residual connection)
-        self.assertEqual(dense_1_features.shape, residual_features.shape)
-        # They should be different values due to the residual connection
-        # Use keras.ops for JAX compatibility
-        dense_1_array = ops.convert_to_numpy(dense_1_features)
-        residual_array = ops.convert_to_numpy(residual_features)
-        self.assertFalse(np.allclose(dense_1_array, residual_array))
-
-    def test_get_teacher_features(self):
-        """Test teacher feature extraction."""
-        strategy = FeatureDistillation(teacher_layer_name="teacher_dense_1")
-
-        # Create dummy input
-        x = np.random.random((2, 10)).astype(np.float32)
-
-        # Get features
-        features = strategy._get_teacher_features(self.teacher, x)
-
-        # Check that features have the expected shape (after first dense layer)
-        self.assertEqual(features.shape, (2, 64))  # batch_size, hidden_dim
-
-        # Check that feature model was created and cached
-        self.assertIsNotNone(strategy._teacher_feature_model)
-
-    def test_get_student_features(self):
-        """Test student feature extraction."""
-        strategy = FeatureDistillation(student_layer_name="student_dense_1")
-
-        # Create dummy input
-        x = np.random.random((2, 10)).astype(np.float32)
-
-        # Get features
-        features = strategy._get_student_features(self.student, x)
-
-        # Check that features have the expected shape (after first dense layer)
-        self.assertEqual(features.shape, (2, 32))  # batch_size, hidden_dim
-
-        # Check that feature model was created and cached
-        self.assertIsNotNone(strategy._student_feature_model)
-
-    def test_feature_distillation_loss_mse(self):
-        """Test feature distillation loss computation with MSE."""
-        strategy = FeatureDistillation(loss_type="mse")
-
         teacher_features = ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
         )
@@ -512,131 +143,40 @@ def test_feature_distillation_loss_mse(self):
             np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"
         )
 
-        # Compute loss
-        loss = strategy.compute_loss(teacher_features, student_features)
-
-        # Check that loss is a scalar tensor
-        self.assertEqual(len(loss.shape), 0)
-
-        # Check that loss is finite and positive
-        self.assertTrue(ops.isfinite(loss))
-        self.assertGreater(loss, 0.0)
-
-    def test_feature_distillation_loss_cosine(self):
-        """Test feature distillation loss computation with cosine similarity."""
-        strategy = FeatureDistillation(loss_type="cosine")
+        loss_mse = strategy_mse.compute_loss(teacher_features, student_features)
+        self.assertEqual(len(loss_mse.shape), 0)
+        self.assertTrue(ops.isfinite(loss_mse))
+        self.assertGreater(loss_mse, 0.0)
 
-        teacher_features = ops.convert_to_tensor(
-            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
-        )
-        student_features = ops.convert_to_tensor(
-            np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"
+        # Test cosine loss
+        strategy_cosine = FeatureDistillation(loss_type="cosine")
+        loss_cosine = strategy_cosine.compute_loss(
+            teacher_features, student_features
         )
-
-        # Compute loss
-        loss = strategy.compute_loss(teacher_features, student_features)
-
-        # Check that loss is a scalar tensor
-        self.assertEqual(len(loss.shape), 0)
-
-        # Check that loss is finite and non-negative (cosine distance)
-        self.assertTrue(ops.isfinite(loss))
-        self.assertGreaterEqual(loss, 0.0)
-
-    def test_get_config(self):
-        """Test configuration serialization."""
-        strategy = FeatureDistillation(
-            loss_type="cosine",
-            teacher_layer_name="teacher_layer",
-            student_layer_name="student_layer",
-        )
-        config = strategy.get_config()
-        expected_config = {
-            "loss_type": "cosine",
-            "teacher_layer_name": "teacher_layer",
-            "student_layer_name": "student_layer",
-        }
-        self.assertEqual(config, expected_config)
-
-    def test_serialization(self):
-        """Test strategy serialization and deserialization."""
-        original_strategy = FeatureDistillation(
-            loss_type="cosine",
-            teacher_layer_name="teacher_layer",
-            student_layer_name="student_layer",
-        )
-        config = original_strategy.get_config()
-        reconstructed_strategy = FeatureDistillation.from_config(config)
-
-        self.assertEqual(
-            original_strategy.loss_type, reconstructed_strategy.loss_type
-        )
-        self.assertEqual(
-            original_strategy.teacher_layer_name,
-            reconstructed_strategy.teacher_layer_name,
-        )
-        self.assertEqual(
-            original_strategy.student_layer_name,
-            reconstructed_strategy.student_layer_name,
-        )
-
-        # Test config matches expected
-        expected_config = {
-            "loss_type": "cosine",
-            "teacher_layer_name": "teacher_layer",
-            "student_layer_name": "student_layer",
-        }
-        self.assertEqual(config, expected_config)
+        self.assertEqual(len(loss_cosine.shape), 0)
+        self.assertTrue(ops.isfinite(loss_cosine))
+        self.assertGreaterEqual(loss_cosine, 0.0)
 
 
 @pytest.mark.requires_trainable_backend
 class TestMultiOutputDistillation(TestCase):
-    """Comprehensive test cases for MultiOutputDistillation strategy."""
-
-    def setUp(self):
-        """Set up test fixtures."""
-        super().setUp()
+    """Essential test cases for MultiOutputDistillation strategy."""
 
+    def test_multi_output_distillation_end_to_end(self):
+        """Test multi-output distillation end-to-end."""
         # Create strategies for different outputs
-        self.logits_strategy = LogitsDistillation(
-            temperature=2.0, output_index=0
-        )
-        self.feature_strategy = FeatureDistillation(loss_type="mse")
+        logits_strategy = LogitsDistillation(temperature=2.0, output_index=0)
+        feature_strategy = FeatureDistillation(loss_type="mse")
 
         # Create multi-output strategy
-        self.strategy = MultiOutputDistillation(
-            output_strategies={
-                0: self.logits_strategy,
-                1: self.feature_strategy,
-            },
-            weights={0: 1.0, 1: 0.5},
-        )
-
-    def test_initialization(self):
-        """Test MultiOutputDistillation initialization."""
-        # Test with explicit weights
         strategy = MultiOutputDistillation(
             output_strategies={
-                0: self.logits_strategy,
-                1: self.feature_strategy,
+                0: logits_strategy,
+                1: feature_strategy,
             },
-            weights={0: 2.0, 1: 1.0},
-        )
-        self.assertEqual(strategy.weights[0], 2.0)
-        self.assertEqual(strategy.weights[1], 1.0)
-
-        # Test with default weights (should be 1.0 for all)
-        strategy = MultiOutputDistillation(
-            output_strategies={
-                0: self.logits_strategy,
-                1: self.feature_strategy,
-            }
+            weights={0: 1.0, 1: 0.5},
         )
-        self.assertEqual(strategy.weights[0], 1.0)
-        self.assertEqual(strategy.weights[1], 1.0)
 
-    def test_multi_output_loss_computation(self):
-        """Test multi-output distillation loss computation."""
         # Create dummy multi-output data
         teacher_outputs = [
             ops.convert_to_tensor(
@@ -656,7 +196,7 @@ def test_multi_output_loss_computation(self):
         ]
 
         # Compute loss
-        loss = self.strategy.compute_loss(teacher_outputs, student_outputs)
+        loss = strategy.compute_loss(teacher_outputs, student_outputs)
 
         # Check that loss is a scalar tensor
         self.assertEqual(len(loss.shape), 0)
@@ -665,79 +205,6 @@ def test_multi_output_loss_computation(self):
         self.assertTrue(ops.isfinite(loss))
         self.assertGreater(loss, 0.0)
 
-    def test_output_validation(self):
-        """Test output validation for multi-output distillation."""
-        # Test with valid outputs
-        teacher_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32"),
-        ]
-        student_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32"),
-        ]
-
-        # Should not raise an error
-        self.strategy.validate_outputs(teacher_outputs, student_outputs)
-
-        # Test with insufficient teacher outputs
-        teacher_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32")
-            # Missing second output
-        ]
-        student_outputs = [
-            ops.convert_to_tensor(np.array([[1.0, 2.0, 3.0]]), dtype="float32"),
-            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32"),
-        ]
-
-        with self.assertRaises(ValueError):
-            self.strategy.validate_outputs(teacher_outputs, student_outputs)
-
-    def test_weight_application(self):
-        """Test that weights are properly applied."""
-        # Create strategies with known behavior
-        strategy1 = MultiOutputDistillation(
-            output_strategies={
-                0: self.logits_strategy,
-                1: self.feature_strategy,
-            },
-            weights={0: 1.0, 1: 1.0},  # Equal weights
-        )
-
-        strategy2 = MultiOutputDistillation(
-            output_strategies={
-                0: self.logits_strategy,
-                1: self.feature_strategy,
-            },
-            weights={0: 2.0, 1: 1.0},  # Different weights
-        )
-
-        # Create test data
-        teacher_outputs = [
-            ops.convert_to_tensor(
-                np.array([[10.0, 20.0, 30.0]]), dtype="float32"
-            ),
-            ops.convert_to_tensor(np.array([[0.1, 0.2]]), dtype="float32"),
-        ]
-        student_outputs = [
-            ops.convert_to_tensor(
-                np.array([[5.0, 15.0, 25.0]]), dtype="float32"
-            ),
-            ops.convert_to_tensor(np.array([[0.15, 0.25]]), dtype="float32"),
-        ]
-
-        # Compute losses
-        loss1 = strategy1.compute_loss(teacher_outputs, student_outputs)
-        loss2 = strategy2.compute_loss(teacher_outputs, student_outputs)
-
-        # Losses should be different due to different weights, but may be
-        # very close
-        # Just verify that both losses are finite and positive
-        self.assertTrue(ops.isfinite(loss1))
-        self.assertTrue(ops.isfinite(loss2))
-        self.assertGreater(loss1, 0.0)
-        self.assertGreater(loss2, 0.0)
-
     def test_end_to_end_with_multi_output_models(self):
         """Test end-to-end training with multi-output models."""
         from keras.src.distillation.distiller import Distiller
@@ -746,6 +213,10 @@ def test_end_to_end_with_multi_output_models(self):
         teacher = MultiOutputTeacher(vocab_size=10, hidden_dim=32)
         student = MultiOutputStudent(vocab_size=10, hidden_dim=16)
 
+        # Build models before creating the distiller
+        teacher.build((None, 5))
+        student.build((None, 5))
+
         # Create multi-output distillation strategy
         multi_strategy = MultiOutputDistillation(
             output_strategies={
@@ -759,7 +230,7 @@ def test_end_to_end_with_multi_output_models(self):
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategy=[multi_strategy],
+            strategy=multi_strategy,
             student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
             student_loss=[
@@ -811,7 +282,7 @@ def test_serialization(self):
             weights={0: 1.0, 1: 0.5},
         )
 
-        # Test get_config (this was the critical bug)
+        # Test get_config
         config = multi_strategy.get_config()
 
         # Verify structure
@@ -819,7 +290,7 @@ def test_serialization(self):
         self.assertIn("weights", config)
         self.assertEqual(config["weights"], {0: 1.0, 1: 0.5})
 
-        # Test JSON serialization (this was failing before the fix)
+        # Test JSON serialization
         json_str = json.dumps(config)
         self.assertIsInstance(json_str, str)
 

From a109178f34a2ad808ddfbad2658c5199a0d3c1e4 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 18 Aug 2025 15:12:51 -0700
Subject: [PATCH 15/31] code reformat

---
 keras/src/distillation/strategies.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 9318ec8f74bd..7ee938014d25 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -172,7 +172,7 @@ def __init__(
             raise ValueError(
                 "temperature must be > 0. Set a positive value (e.g., 1-10)."
             )
- 
+
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for logits distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)

From 5b6bf036408d82bcca5b959877aefab603f5546e Mon Sep 17 00:00:00 2001
From: divyashreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 25 Aug 2025 21:54:31 +0000
Subject: [PATCH 16/31] clean up

---
 keras/src/distillation/distiller.py       |  8 ++------
 keras/src/distillation/distiller_test.py  |  9 ++++-----
 keras/src/distillation/strategies.py      | 12 ++++--------
 keras/src/distillation/strategies_test.py |  5 +++--
 4 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 27063807c492..28bc6f8dc075 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -1,6 +1,8 @@
 import keras
+from keras.src import ops
 from keras.src.api_export import keras_export
 from keras.src.models.model import Model
+from keras.src.saving import serialization_lib
 
 
 @keras_export("keras.distillation.Distiller")
@@ -594,8 +596,6 @@ def _custom_distillation_loss(self, teacher_outputs,
                 )
 
             # Ensure student_loss is a scalar
-            from keras import ops
-
             if hasattr(student_loss, "shape") and len(student_loss.shape) > 0:
                 student_loss = ops.mean(student_loss)
 
@@ -611,8 +611,6 @@ def _custom_distillation_loss(self, teacher_outputs,
             )
 
             # Ensure distillation_loss is a scalar
-            from keras import ops
-
             if (
                 hasattr(distillation_loss, "shape")
                 and len(distillation_loss.shape) > 0
@@ -650,7 +648,6 @@ def metrics(self):
 
     def get_config(self):
         """Get configuration for serialization."""
-        from keras.src.saving import serialization_lib
 
         config = super().get_config()
         config.update(
@@ -674,7 +671,6 @@ def get_config(self):
     @classmethod
     def from_config(cls, config):
         """Create instance from configuration."""
-        from keras.src.saving import serialization_lib
 
         config["teacher"] = serialization_lib.deserialize_keras_object(
             config["teacher"]
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 6a80026532f6..5186357062d8 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -1,3 +1,7 @@
+import json
+import os
+import tempfile
+
 import numpy as np
 import pytest
 
@@ -334,9 +338,6 @@ def test_get_student_model_method(self):
 
     def test_distiller_serialization_and_saving(self):
         """Test Distiller serialization, saving, and loading."""
-        import json
-        import os
-        import tempfile
 
         # Use standard Sequential models for serialization testing
         teacher = keras.Sequential(
@@ -364,8 +365,6 @@ def test_distiller_serialization_and_saving(self):
         )
 
         # Create distiller with single strategy
-        from keras.src.distillation.strategies import LogitsDistillation
-
         strategy = LogitsDistillation(
             temperature=3.0, loss_type="kl_divergence"
         )
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 7ee938014d25..5f5408a20485 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -1,5 +1,7 @@
 import keras
+from keras.src import ops
 from keras.src.api_export import keras_export
+from keras.src.saving import serialization_lib
 
 
 @keras_export("keras.distillation.BaseDistillationStrategy")
@@ -107,6 +109,7 @@ class LogitsDistillation(BaseDistillationStrategy):
     Example:
 
     ```python
+    from keras import ops
     # Basic logits distillation
     strategy = LogitsDistillation(temperature=3.0)
 
@@ -119,7 +122,6 @@ class LogitsDistillation(BaseDistillationStrategy):
     # Custom loss by subclassing
     class CustomLogitsDistillation(LogitsDistillation):
         def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            from keras import ops
             # Get the outputs to distill
             teacher_logits = teacher_outputs[self.output_index]
             student_logits = student_outputs[self.output_index]
@@ -219,7 +221,6 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         Returns:
             Distillation loss tensor.
         """
-        from keras import ops
 
         # Normalize outputs to lists
         if not isinstance(teacher_outputs, (list, tuple)):
@@ -347,7 +348,6 @@ class FeatureDistillation(BaseDistillationStrategy):
     # Custom loss by subclassing
     class CustomFeatureDistillation(FeatureDistillation):
         def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            from keras import ops
             # Use first output by default
             teacher_features = teacher_outputs[0]
             student_features = student_outputs[0]
@@ -605,7 +605,6 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         Returns:
             Feature distillation loss tensor.
         """
-        from keras import ops
 
         # Normalize outputs to lists
         if not isinstance(teacher_outputs, (list, tuple)):
@@ -729,7 +728,6 @@ class MultiOutputDistillation(BaseDistillationStrategy):
     # Custom multi-output strategy
     class CustomMultiOutputDistillation(MultiOutputDistillation):
         def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            from keras import ops
             # Get the outputs to distill
             teacher_logits = teacher_outputs[0]
             student_logits = student_outputs[0]
@@ -747,7 +745,6 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
 
     class CustomFeatureDistillation(FeatureDistillation):
         def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            from keras import ops
             teacher_features = teacher_outputs[0]
             student_features = student_outputs[0]
             return ops.mean(ops.abs(teacher_features - student_features))
@@ -843,7 +840,6 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
 
     def get_config(self):
         """Get configuration for serialization."""
-        from keras.src.saving import serialization_lib
 
         return {
             "output_strategies": {
@@ -856,7 +852,7 @@ def get_config(self):
     @classmethod
     def from_config(cls, config):
         """Create instance from configuration."""
-        from keras.src.saving import serialization_lib
+
 
         # JSON keys must be strings, so we convert them back to int
         config["output_strategies"] = {
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index 6ea022ef6dfb..68c6ba5ca7c9 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -1,8 +1,11 @@
+import json
+
 import numpy as np
 import pytest
 
 import keras
 from keras import ops
+from keras.src.distillation.distiller import Distiller
 from keras.src.distillation.strategies import FeatureDistillation
 from keras.src.distillation.strategies import LogitsDistillation
 from keras.src.distillation.strategies import MultiOutputDistillation
@@ -207,7 +210,6 @@ def test_multi_output_distillation_end_to_end(self):
 
     def test_end_to_end_with_multi_output_models(self):
         """Test end-to-end training with multi-output models."""
-        from keras.src.distillation.distiller import Distiller
 
         # Create multi-output models
         teacher = MultiOutputTeacher(vocab_size=10, hidden_dim=32)
@@ -271,7 +273,6 @@ def test_end_to_end_with_multi_output_models(self):
 
     def test_serialization(self):
         """Test MultiOutputDistillation serialization and deserialization."""
-        import json
 
         # Create nested strategies
         strategy1 = LogitsDistillation(temperature=3.0, output_index=0)

From de73fa648ed607bb23cbbdfae972af57c89529fa Mon Sep 17 00:00:00 2001
From: divyashreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 25 Aug 2025 21:59:36 +0000
Subject: [PATCH 17/31] code reformat

---
 keras/src/distillation/strategies.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 5f5408a20485..33f01f6fcb43 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -853,7 +853,6 @@ def get_config(self):
     def from_config(cls, config):
         """Create instance from configuration."""
 
-
         # JSON keys must be strings, so we convert them back to int
         config["output_strategies"] = {
             int(k): serialization_lib.deserialize_keras_object(v)

From 5cd56bf288dbc6e072a734828b5b63e950811d8e Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Thu, 28 Aug 2025 15:36:22 -0700
Subject: [PATCH 18/31] remove multi output distiilation

---
 .../_tf_keras/keras/distillation/__init__.py  |   3 -
 keras/api/distillation/__init__.py            |   3 -
 keras/src/distillation/distiller.py           | 151 +++++++++++++-----
 keras/src/distillation/distiller_test.py      |  72 ++++++++-
 keras/src/distillation/strategies_test.py     |  59 ++++---
 5 files changed, 205 insertions(+), 83 deletions(-)

diff --git a/keras/api/_tf_keras/keras/distillation/__init__.py b/keras/api/_tf_keras/keras/distillation/__init__.py
index 95ce52c2dfd6..48dd038b3f3c 100644
--- a/keras/api/_tf_keras/keras/distillation/__init__.py
+++ b/keras/api/_tf_keras/keras/distillation/__init__.py
@@ -14,6 +14,3 @@
 from keras.src.distillation.strategies import (
     LogitsDistillation as LogitsDistillation,
 )
-from keras.src.distillation.strategies import (
-    MultiOutputDistillation as MultiOutputDistillation,
-)
diff --git a/keras/api/distillation/__init__.py b/keras/api/distillation/__init__.py
index 95ce52c2dfd6..48dd038b3f3c 100644
--- a/keras/api/distillation/__init__.py
+++ b/keras/api/distillation/__init__.py
@@ -14,6 +14,3 @@
 from keras.src.distillation.strategies import (
     LogitsDistillation as LogitsDistillation,
 )
-from keras.src.distillation.strategies import (
-    MultiOutputDistillation as MultiOutputDistillation,
-)
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 28bc6f8dc075..cefec92489e3 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -53,9 +53,10 @@ class Distiller(Model):
       learn better internal representations. Often leads to better performance
       than logits-only.
 
-    - `MultiOutputDistillation`: For models with multiple outputs (e.g.,
-      object detection with classification and regression heads). Allows
-      different distillation strategies for different outputs.
+    - Multiple Strategies: For models with multiple outputs (e.g., object
+      detection with classification and regression heads), pass a list of
+      strategies with corresponding weights. Each strategy will be applied to
+      its corresponding output.
 
     - Custom Strategies: Create custom strategies by subclassing
       `BaseDistillationStrategy` and overriding the `compute_loss` method.
@@ -66,9 +67,14 @@ class Distiller(Model):
         student: A `keras.Model` to be trained through distillation. This model
             will learn from both ground truth labels and the teacher's
             predictions.
-        strategy: Distillation strategy to apply. Can be `LogitsDistillation`,
-            `FeatureDistillation`, `MultiOutputDistillation`, or a custom
+        strategy: Single distillation strategy to apply. Can be
+            `LogitsDistillation`, `FeatureDistillation`, or a custom strategy.
+            Use `strategies` for multiple strategies.
+        strategies: List of distillation strategies to apply. Each strategy will
+            be applied to its corresponding output. Use `strategy` for a single
             strategy.
+        strategy_weights: List of weights for each strategy. Must have the same
+            length as `strategies`. If None, equal weights are used.
         student_loss_weight: Weight for the student's supervised loss component.
             Must be between 0 and 1. Higher values emphasize ground truth
             labels, lower values emphasize teacher predictions. Defaults to 0.5.
@@ -107,26 +113,25 @@ class Distiller(Model):
     # Train the distiller
     distiller.fit(x_train, y_train, epochs=10, validation_split=0.2)
 
-    # Get the trained student model
-    trained_student = distiller.get_student_model()
+    # Access the trained student model
+    trained_student = distiller.student_model
     ```
 
     For multi-output models:
 
     ```python
-    # Create multi-output strategy
-    multi_strategy = MultiOutputDistillation(
-        output_strategies={
-            0: LogitsDistillation(temperature=3.0, output_index=0),
-            1: LogitsDistillation(temperature=2.0, output_index=1)
-        },
-        weights={0: 1.0, 1: 0.5}  # Weight classification more heavily
-    )
+    # Create multiple strategies for different outputs
+    strategies = [
+        LogitsDistillation(temperature=3.0, output_index=0),
+        LogitsDistillation(temperature=2.0, output_index=1)
+    ]
+    strategy_weights = [1.0, 0.5]  # Weight classification more heavily
 
     distiller = Distiller(
         teacher=teacher,
         student=student,
-        strategy=multi_strategy,
+        strategies=strategies,
+        strategy_weights=strategy_weights,
         student_loss_weight=0.5,
         optimizer='adam',
         student_loss=['sparse_categorical_crossentropy', 'mse']
@@ -138,7 +143,9 @@ def __init__(
         self,
         teacher,
         student,
-        strategy,
+        strategy=None,
+        strategies=None,
+        strategy_weights=None,
         student_loss_weight=0.5,
         optimizer="adam",
         student_loss="sparse_categorical_crossentropy",
@@ -214,17 +221,51 @@ def __init__(
         # Validate architecture compatibility for feature distillation
         self._validate_architecture_compatibility(teacher, student)
 
-        # Store strategy (single strategy only)
-        if strategy is None:
+        # Handle strategy configuration
+        if strategy is not None and strategies is not None:
+            raise ValueError(
+                "Cannot specify both 'strategy' and 'strategies'. "
+                "Use 'strategy' for single strategy or 'strategies' for "
+                "multiple strategies."
+            )
+
+        if strategy is not None:
+            # Single strategy mode
+            self.strategies = [strategy]
+            self.strategy_weights = [1.0]
+            self.single_strategy = True
+        elif strategies is not None:
+            # Multiple strategies mode
+            if not isinstance(strategies, (list, tuple)):
+                raise ValueError(
+                    f"strategies must be a list or tuple, got "
+                    f"{type(strategies)}"
+                )
+
+            self.strategies = strategies
+
+            # Set default weights if not provided
+            if strategy_weights is None:
+                self.strategy_weights = [1.0] * len(strategies)
+            else:
+                if len(strategy_weights) != len(strategies):
+                    raise ValueError(
+                        f"Number of strategy_weights ({len(strategy_weights)}) "
+                        f"must match number of strategies ({len(strategies)})"
+                    )
+                self.strategy_weights = strategy_weights
+
+            self.single_strategy = False
+        else:
             raise ValueError(
-                "Distillation strategy cannot be None. "
+                "Must specify either 'strategy' or 'strategies'. "
                 "Please provide a valid strategy such as LogitsDistillation, "
-                "FeatureDistillation, or MultiOutputDistillation."
+                "FeatureDistillation, or a list of strategies."
             )
-        self.strategy = strategy
 
         # Validate strategy-specific compatibility
-        self._validate_strategy_compatibility(teacher, student)
+        for strategy in self.strategies:
+            self._validate_strategy_compatibility(teacher, student, strategy)
 
         # Freeze teacher model
         self.teacher.trainable = False
@@ -398,11 +439,11 @@ def _validate_architecture_compatibility(self, teacher, student):
         # that require specific architectural compatibility
         pass
 
-    def _validate_strategy_compatibility(self, teacher, student):
+    def _validate_strategy_compatibility(self, teacher, student, strategy):
         """Validate that the strategy is compatible with the teacher and student
         models."""
-        if hasattr(self.strategy, "validate_model_compatibility"):
-            self.strategy.validate_model_compatibility(teacher, student)
+        if hasattr(strategy, "validate_model_compatibility"):
+            strategy.validate_model_compatibility(teacher, student)
 
     def _shapes_are_compatible(self, shape1, shape2):
         """Check if two shapes are compatible (allowing for batch dimension
@@ -432,12 +473,13 @@ def _shapes_are_compatible(self, shape1, shape2):
                 return False
         return True
 
-    def get_student_model(self):
-        """Get the trained student model for independent use.
+    @property
+    def student_model(self):
+        """The trained student model for independent use.
 
-        This method returns the student model that has been trained through
-        the distillation process. The returned model can be used independently
-        for inference, further training, or saving.
+        This property provides access to the student model that has been trained
+        through the distillation process. The student model can be used
+        independently for inference, further training, or saving.
 
         Returns:
             keras.Model: The trained student model.
@@ -447,8 +489,8 @@ def get_student_model(self):
             # After training the distiller
             distiller.fit(x_train, y_train, epochs=10)
 
-            # Get the trained student model
-            trained_student = distiller.get_student_model()
+            # Access the trained student model
+            trained_student = distiller.student_model
 
             # Use the student model independently
             predictions = trained_student.predict(x_test)
@@ -605,10 +647,30 @@ def _custom_distillation_loss(self, teacher_outputs,
             # Get teacher outputs
             teacher_outputs = self.teacher(x, training=False)
 
-            # Apply the single strategy
-            distillation_loss = self.strategy.compute_loss(
-                teacher_outputs, y_pred
-            )
+            # Apply strategies
+            for i, (strategy, weight) in enumerate(
+                zip(self.strategies, self.strategy_weights)
+            ):
+                # Get the corresponding output for this strategy
+                if isinstance(y_pred, (list, tuple)) and i < len(y_pred):
+                    strategy_output = y_pred[i]
+                else:
+                    strategy_output = y_pred
+
+                if isinstance(teacher_outputs, (list, tuple)) and i < len(
+                    teacher_outputs
+                ):
+                    strategy_teacher_output = teacher_outputs[i]
+                else:
+                    strategy_teacher_output = teacher_outputs
+
+                # Compute loss for this strategy
+                strategy_loss = strategy.compute_loss(
+                    strategy_teacher_output, strategy_output
+                )
+
+                # Apply weight and add to total
+                distillation_loss += weight * strategy_loss
 
             # Ensure distillation_loss is a scalar
             if (
@@ -658,9 +720,11 @@ def get_config(self):
                 "student": serialization_lib.serialize_keras_object(
                     self.student
                 ),
-                "strategy": serialization_lib.serialize_keras_object(
-                    self.strategy
-                ),
+                "strategies": [
+                    serialization_lib.serialize_keras_object(strategy)
+                    for strategy in self.strategies
+                ],
+                "strategy_weights": self.strategy_weights,
                 "student_loss_weight": self.student_loss_weight,
                 "input_mapping": self.input_mapping,
                 "output_mapping": self.output_mapping,
@@ -678,7 +742,8 @@ def from_config(cls, config):
         config["student"] = serialization_lib.deserialize_keras_object(
             config["student"]
         )
-        config["strategy"] = serialization_lib.deserialize_keras_object(
-            config["strategy"]
-        )
+        config["strategies"] = [
+            serialization_lib.deserialize_keras_object(strategy)
+            for strategy in config["strategies"]
+        ]
         return cls(**config)
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 5186357062d8..5877cc92e001 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -78,11 +78,13 @@ def test_distiller_initialization(self):
         # Check student_loss_weight
         self.assertEqual(self.distiller.student_loss_weight, 0.5)
 
-        # Check strategy
-        self.assertIsInstance(self.distiller.strategy, LogitsDistillation)
+        # Check strategies (should be a list with one strategy)
+        self.assertIsInstance(self.distiller.strategies, list)
+        self.assertEqual(len(self.distiller.strategies), 1)
+        self.assertIsInstance(self.distiller.strategies[0], LogitsDistillation)
 
         # Check that strategy has the correct temperature
-        self.assertEqual(self.distiller.strategy.temperature, 2.0)
+        self.assertEqual(self.distiller.strategies[0].temperature, 2.0)
 
         # Check that model is compiled
         self.assertIsNotNone(self.distiller.optimizer)
@@ -149,6 +151,70 @@ def test_model_compatibility_validation(self):
                 strategy=self.strategy,
             )
 
+    def test_multi_strategy_functionality(self):
+        """Test multi-strategy functionality."""
+        # Create multiple strategies
+        strategies = [
+            LogitsDistillation(temperature=3.0, output_index=0),
+            LogitsDistillation(temperature=2.0, output_index=0),
+        ]
+        strategy_weights = [1.0, 0.5]
+
+        # Create distiller with multiple strategies
+        distiller = Distiller(
+            teacher=self.teacher,
+            student=self.student,
+            strategies=strategies,
+            strategy_weights=strategy_weights,
+            student_loss_weight=0.5,
+            optimizer="adam",
+            student_loss="sparse_categorical_crossentropy",
+        )
+
+        # Check that strategies are stored correctly
+        self.assertEqual(len(distiller.strategies), 2)
+        self.assertEqual(distiller.strategy_weights, [1.0, 0.5])
+        self.assertFalse(distiller.single_strategy)
+
+        # Test that both strategies have correct temperatures
+        self.assertEqual(distiller.strategies[0].temperature, 3.0)
+        self.assertEqual(distiller.strategies[1].temperature, 2.0)
+
+    def test_multi_strategy_validation(self):
+        """Test multi-strategy validation."""
+        strategies = [
+            LogitsDistillation(temperature=3.0, output_index=0),
+            LogitsDistillation(temperature=2.0, output_index=0),
+        ]
+
+        # Test with mismatched weights
+        with self.assertRaises(ValueError):
+            Distiller(
+                teacher=self.teacher,
+                student=self.student,
+                strategies=strategies,
+                strategy_weights=[1.0],  # Wrong length
+                student_loss_weight=0.5,
+            )
+
+        # Test with both strategy and strategies
+        with self.assertRaises(ValueError):
+            Distiller(
+                teacher=self.teacher,
+                student=self.student,
+                strategy=self.strategy,
+                strategies=strategies,
+                student_loss_weight=0.5,
+            )
+
+        # Test with neither strategy nor strategies
+        with self.assertRaises(ValueError):
+            Distiller(
+                teacher=self.teacher,
+                student=self.student,
+                student_loss_weight=0.5,
+            )
+
     def test_student_loss_weighting(self):
         # Test with student_loss_weight = 0.0 (only distillation loss)
         distiller_0 = Distiller(
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index 68c6ba5ca7c9..9a0534c94667 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -8,7 +8,6 @@
 from keras.src.distillation.distiller import Distiller
 from keras.src.distillation.strategies import FeatureDistillation
 from keras.src.distillation.strategies import LogitsDistillation
-from keras.src.distillation.strategies import MultiOutputDistillation
 from keras.src.testing import TestCase
 
 
@@ -162,24 +161,15 @@ def test_feature_distillation_end_to_end(self):
 
 
 @pytest.mark.requires_trainable_backend
-class TestMultiOutputDistillation(TestCase):
-    """Essential test cases for MultiOutputDistillation strategy."""
+class TestMultiStrategyDistillation(TestCase):
+    """Essential test cases for multi-strategy distillation."""
 
-    def test_multi_output_distillation_end_to_end(self):
-        """Test multi-output distillation end-to-end."""
+    def test_multi_strategy_distillation_end_to_end(self):
+        """Test multi-strategy distillation end-to-end."""
         # Create strategies for different outputs
         logits_strategy = LogitsDistillation(temperature=2.0, output_index=0)
         feature_strategy = FeatureDistillation(loss_type="mse")
 
-        # Create multi-output strategy
-        strategy = MultiOutputDistillation(
-            output_strategies={
-                0: logits_strategy,
-                1: feature_strategy,
-            },
-            weights={0: 1.0, 1: 0.5},
-        )
-
         # Create dummy multi-output data
         teacher_outputs = [
             ops.convert_to_tensor(
@@ -198,15 +188,23 @@ def test_multi_output_distillation_end_to_end(self):
             ),
         ]
 
-        # Compute loss
-        loss = strategy.compute_loss(teacher_outputs, student_outputs)
+        # Test individual strategies
+        logits_loss = logits_strategy.compute_loss(
+            [teacher_outputs[0]], [student_outputs[0]]
+        )
+        feature_loss = feature_strategy.compute_loss(
+            [teacher_outputs[1]], [student_outputs[1]]
+        )
 
-        # Check that loss is a scalar tensor
-        self.assertEqual(len(loss.shape), 0)
+        # Check that losses are scalar tensors
+        self.assertEqual(len(logits_loss.shape), 0)
+        self.assertEqual(len(feature_loss.shape), 0)
 
-        # Check that loss is finite and positive
-        self.assertTrue(ops.isfinite(loss))
-        self.assertGreater(loss, 0.0)
+        # Check that losses are finite and positive
+        self.assertTrue(ops.isfinite(logits_loss))
+        self.assertTrue(ops.isfinite(feature_loss))
+        self.assertGreater(logits_loss, 0.0)
+        self.assertGreater(feature_loss, 0.0)
 
     def test_end_to_end_with_multi_output_models(self):
         """Test end-to-end training with multi-output models."""
@@ -219,20 +217,19 @@ def test_end_to_end_with_multi_output_models(self):
         teacher.build((None, 5))
         student.build((None, 5))
 
-        # Create multi-output distillation strategy
-        multi_strategy = MultiOutputDistillation(
-            output_strategies={
-                0: LogitsDistillation(temperature=2.0, output_index=0),
-                1: FeatureDistillation(loss_type="mse"),
-            },
-            weights={0: 1.0, 1: 0.5},
-        )
+        # Create strategies list
+        strategies = [
+            LogitsDistillation(temperature=2.0, output_index=0),
+            FeatureDistillation(loss_type="mse"),
+        ]
+        strategy_weights = [1.0, 0.5]
 
-        # Create distiller
+        # Create distiller with strategies list
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategy=multi_strategy,
+            strategies=strategies,
+            strategy_weights=strategy_weights,
             student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
             student_loss=[

From 0b2d88fa74878c91a3a1caac5cef5d94cde2ddd2 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Thu, 28 Aug 2025 15:54:02 -0700
Subject: [PATCH 19/31] clean up after merge

---
 keras/src/distillation/distiller_test.py  |  15 +-
 keras/src/distillation/strategies.py      | 208 ----------------------
 keras/src/distillation/strategies_test.py |  50 +-----
 3 files changed, 12 insertions(+), 261 deletions(-)

diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 5877cc92e001..28219d6b1469 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -395,12 +395,12 @@ def test_get_student_model_method(self):
             student_loss="sparse_categorical_crossentropy",
         )
 
-        # Test that get_student_model returns the same as direct access
+        # Test that student_model property returns the same as direct access
         student_direct = distiller.student
-        student_method = distiller.get_student_model()
+        student_property = distiller.student_model
 
-        self.assertIs(student_direct, student_method)
-        self.assertEqual(student_method.name, self.student.name)
+        self.assertIs(student_direct, student_property)
+        self.assertEqual(student_property.name, self.student.name)
 
     def test_distiller_serialization_and_saving(self):
         """Test Distiller serialization, saving, and loading."""
@@ -455,7 +455,8 @@ def test_distiller_serialization_and_saving(self):
         required_keys = [
             "teacher",
             "student",
-            "strategy",
+            "strategies",
+            "strategy_weights",
             "student_loss_weight",
             "input_mapping",
             "output_mapping",
@@ -473,11 +474,11 @@ def test_distiller_serialization_and_saving(self):
         # Verify reconstruction
         self.assertEqual(reconstructed_distiller.student_loss_weight, 0.7)
         self.assertIsInstance(
-            reconstructed_distiller.strategy, LogitsDistillation
+            reconstructed_distiller.strategies[0], LogitsDistillation
         )
 
         # Verify strategy parameters
-        self.assertEqual(reconstructed_distiller.strategy.temperature, 3.0)
+        self.assertEqual(reconstructed_distiller.strategies[0].temperature, 3.0)
 
         # Test that reconstructed distiller can be used for inference
         reconstructed_output = reconstructed_distiller(x_test)
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 33f01f6fcb43..087408a7612b 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -1,7 +1,6 @@
 import keras
 from keras.src import ops
 from keras.src.api_export import keras_export
-from keras.src.saving import serialization_lib
 
 
 @keras_export("keras.distillation.BaseDistillationStrategy")
@@ -652,210 +651,3 @@ def get_config(self):
     def from_config(cls, config):
         """Create instance from configuration."""
         return cls(**config)
-
-
-@keras_export("keras.distillation.MultiOutputDistillation")
-class MultiOutputDistillation(BaseDistillationStrategy):
-    """Multi-output distillation strategy for models with multiple outputs.
-
-    Multi-output distillation handles models with multiple outputs, such as
-    object detection models (classification + regression), multi-task learning
-    models, or any model with multiple prediction heads. This strategy allows
-    different distillation approaches for different outputs.
-
-    How Multi-Output Distillation Works:
-
-    1. Output Mapping: Map each output index to a specific distillation
-       strategy. Different outputs can use different strategies based on their
-       nature (classification vs regression, different loss functions, etc.).
-
-    2. Strategy Application: Apply the appropriate strategy to each output
-       pair (teacher output i → student output i).
-
-    3. Loss Combination: Combine the losses from all outputs using
-       configurable weights. This allows prioritizing certain outputs over
-       others.
-
-    When to Use Multi-Output Distillation:
-
-    - Multi-Task Models: Models with multiple outputs (classification +
-      regression)
-    - Object Detection: Models with classification and bounding box outputs
-    - Segmentation: Models with classification and mask outputs
-    - Custom Architectures: Any model with multiple distinct outputs
-
-    Output Strategy Selection:
-
-    - Classification Outputs: Use `LogitsDistillation` with appropriate
-      temperature
-    - Regression Outputs: Use `LogitsDistillation` with lower temperature or
-      `FeatureDistillation` with MSE loss
-    - Feature Outputs: Use `FeatureDistillation` to transfer intermediate
-      representations
-    - Mixed Types: Combine different strategies for different outputs
-    - Custom Losses: Each strategy can be subclassed to override
-      `compute_loss` method
-
-    Weight Configuration:
-
-    - Equal Weights: Default behavior, all outputs weighted equally
-    - Task-Specific Weights: Weight outputs based on task importance
-    - Loss-Scale Weights: Adjust weights to balance different loss scales
-    - Performance-Based: Weight outputs based on their impact on final
-      performance
-
-    Args:
-        output_strategies: Dict mapping output indices to distillation
-            strategies. Each strategy will be applied to the corresponding
-            output. Example: `{0: LogitsDistillation(), 1:
-            FeatureDistillation()}`
-        weights: Dict mapping output indices to weights for combining losses.
-            Defaults to equal weights for all outputs. Example:
-            `{0: 1.0, 1: 0.5}`
-
-    Example:
-
-    ```python
-    # Multi-output distillation for object detection
-    strategy = MultiOutputDistillation(
-        output_strategies={
-            0: LogitsDistillation(temperature=3.0, output_index=0),
-            1: LogitsDistillation(temperature=1.0, output_index=1)
-        },
-        weights={0: 1.0, 1: 0.5}  # Weight classification more heavily
-    )
-
-    # Custom multi-output strategy
-    class CustomMultiOutputDistillation(MultiOutputDistillation):
-        def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            # Get the outputs to distill
-            teacher_logits = teacher_outputs[0]
-            student_logits = student_outputs[0]
-
-            # Apply temperature scaling
-            teacher_logits = teacher_logits / 3.0
-            student_logits = student_logits / 3.0
-
-            # Custom loss computation
-            teacher_probs = ops.softmax(teacher_logits, axis=-1)
-            student_probs = ops.softmax(student_logits, axis=-1)
-            return ops.mean(
-                keras.losses.kl_divergence(teacher_probs, student_probs)
-            )
-
-    class CustomFeatureDistillation(FeatureDistillation):
-        def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            teacher_features = teacher_outputs[0]
-            student_features = student_outputs[0]
-            return ops.mean(ops.abs(teacher_features - student_features))
-
-    strategy = MultiOutputDistillation(
-        output_strategies={
-            0: CustomLogitsDistillation(temperature=4.0, output_index=0),
-            1: CustomFeatureDistillation(output_index=1)
-        }
-    )
-
-    # Equal weighting for all outputs
-    strategy = MultiOutputDistillation(
-        output_strategies={
-            0: LogitsDistillation(temperature=3.0, output_index=0),
-            1: LogitsDistillation(temperature=3.0, output_index=1),
-            2: LogitsDistillation(temperature=3.0, output_index=2)
-        }
-        # weights=None (defaults to equal weights)
-    )
-    ```
-    """
-
-    def __init__(self, output_strategies, weights=None):
-        self.output_strategies = output_strategies
-        self.weights = weights or {idx: 1.0 for idx in output_strategies.keys()}
-
-    def validate_outputs(self, teacher_outputs, student_outputs):
-        """Validate outputs are compatible for multi-output distillation."""
-        super().validate_outputs(teacher_outputs, student_outputs)
-
-        # Ensure outputs are lists/tuples
-        if not isinstance(teacher_outputs, (list, tuple)):
-            teacher_outputs = [teacher_outputs]
-        if not isinstance(student_outputs, (list, tuple)):
-            student_outputs = [student_outputs]
-
-        # Check that all required outputs exist
-        max_output_index = max(self.output_strategies.keys())
-        if max_output_index >= len(teacher_outputs):
-            raise ValueError(
-                f"Teacher model doesn't have enough outputs. "
-                f"Required: {max_output_index + 1}, available: "
-                f"{len(teacher_outputs)}"
-            )
-        if max_output_index >= len(student_outputs):
-            raise ValueError(
-                f"Student model doesn't have enough outputs. "
-                f"Required: {max_output_index + 1}, available: "
-                f"{len(student_outputs)}"
-            )
-
-        # Validate each strategy with its corresponding outputs
-        for output_idx, strategy in self.output_strategies.items():
-            if hasattr(strategy, "validate_outputs"):
-                strategy.validate_outputs(
-                    [teacher_outputs[output_idx]], [student_outputs[output_idx]]
-                )
-
-    def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-        """Compute multi-output distillation loss.
-
-        Args:
-            teacher_outputs: Outputs from teacher model.
-            student_outputs: Outputs from student model.
-            **kwargs: Additional arguments passed to individual strategies.
-
-        Returns:
-            Combined distillation loss tensor.
-        """
-        # Normalize outputs to lists
-        if not isinstance(teacher_outputs, (list, tuple)):
-            teacher_outputs = [teacher_outputs]
-        if not isinstance(student_outputs, (list, tuple)):
-            student_outputs = [student_outputs]
-
-        total_loss = 0.0
-
-        for output_idx, strategy in self.output_strategies.items():
-            teacher_output = teacher_outputs[output_idx]
-            student_output = student_outputs[output_idx]
-
-            # Compute loss for this output
-            output_loss = strategy.compute_loss(
-                [teacher_output], [student_output], **kwargs
-            )
-
-            # Apply weight
-            weight = self.weights.get(output_idx, 1.0)
-            total_loss += weight * output_loss
-
-        return total_loss
-
-    def get_config(self):
-        """Get configuration for serialization."""
-
-        return {
-            "output_strategies": {
-                k: serialization_lib.serialize_keras_object(v)
-                for k, v in self.output_strategies.items()
-            },
-            "weights": self.weights,
-        }
-
-    @classmethod
-    def from_config(cls, config):
-        """Create instance from configuration."""
-
-        # JSON keys must be strings, so we convert them back to int
-        config["output_strategies"] = {
-            int(k): serialization_lib.deserialize_keras_object(v)
-            for k, v in config["output_strategies"].items()
-        }
-        return cls(**config)
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index 9a0534c94667..ef01d3bb6a64 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -1,5 +1,3 @@
-import json
-
 import numpy as np
 import pytest
 
@@ -170,7 +168,7 @@ def test_multi_strategy_distillation_end_to_end(self):
         logits_strategy = LogitsDistillation(temperature=2.0, output_index=0)
         feature_strategy = FeatureDistillation(loss_type="mse")
 
-        # Create dummy multi-output data
+        # Create dummy multi-output data with very different values
         teacher_outputs = [
             ops.convert_to_tensor(
                 np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
@@ -181,10 +179,11 @@ def test_multi_strategy_distillation_end_to_end(self):
         ]
         student_outputs = [
             ops.convert_to_tensor(
-                np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"
+                np.array([[10.0, 20.0, 30.0], [40.0, 50.0, 60.0]]),
+                dtype="float32",
             ),
             ops.convert_to_tensor(
-                np.array([[0.15, 0.25], [0.35, 0.45]]), dtype="float32"
+                np.array([[0.5, 0.6], [0.7, 0.8]]), dtype="float32"
             ),
         ]
 
@@ -267,44 +266,3 @@ def test_end_to_end_with_multi_output_models(self):
         self.assertEqual(
             predictions[0].shape, (5, 10)
         )  # Should return first output
-
-    def test_serialization(self):
-        """Test MultiOutputDistillation serialization and deserialization."""
-
-        # Create nested strategies
-        strategy1 = LogitsDistillation(temperature=3.0, output_index=0)
-        strategy2 = FeatureDistillation(loss_type="mse")
-
-        multi_strategy = MultiOutputDistillation(
-            output_strategies={0: strategy1, 1: strategy2},
-            weights={0: 1.0, 1: 0.5},
-        )
-
-        # Test get_config
-        config = multi_strategy.get_config()
-
-        # Verify structure
-        self.assertIn("output_strategies", config)
-        self.assertIn("weights", config)
-        self.assertEqual(config["weights"], {0: 1.0, 1: 0.5})
-
-        # Test JSON serialization
-        json_str = json.dumps(config)
-        self.assertIsInstance(json_str, str)
-
-        # Test from_config
-        reconstructed = MultiOutputDistillation.from_config(config)
-
-        # Verify reconstruction
-        self.assertEqual(len(reconstructed.output_strategies), 2)
-        self.assertEqual(reconstructed.weights, {0: 1.0, 1: 0.5})
-
-        # Verify nested strategies
-        self.assertIsInstance(
-            reconstructed.output_strategies[0], LogitsDistillation
-        )
-        self.assertIsInstance(
-            reconstructed.output_strategies[1], FeatureDistillation
-        )
-        self.assertEqual(reconstructed.output_strategies[0].temperature, 3.0)
-        self.assertEqual(reconstructed.output_strategies[1].loss_type, "mse")

From 9d8242c1900139f6697ae59ae922a276b8a0245e Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Fri, 29 Aug 2025 16:57:54 -0700
Subject: [PATCH 20/31] address comments

---
 .../_tf_keras/keras/distillation/__init__.py  |   2 +-
 keras/api/distillation/__init__.py            |   2 +-
 keras/src/distillation/distiller.py           | 423 ++++++++-----
 keras/src/distillation/distiller_test.py      |  85 +--
 keras/src/distillation/strategies.py          | 557 +++++++++---------
 keras/src/distillation/strategies_test.py     |  71 +--
 6 files changed, 632 insertions(+), 508 deletions(-)

diff --git a/keras/api/_tf_keras/keras/distillation/__init__.py b/keras/api/_tf_keras/keras/distillation/__init__.py
index 48dd038b3f3c..b1659fe83b6b 100644
--- a/keras/api/_tf_keras/keras/distillation/__init__.py
+++ b/keras/api/_tf_keras/keras/distillation/__init__.py
@@ -6,7 +6,7 @@
 
 from keras.src.distillation.distiller import Distiller as Distiller
 from keras.src.distillation.strategies import (
-    BaseDistillationStrategy as BaseDistillationStrategy,
+    DistillationLoss as DistillationLoss,
 )
 from keras.src.distillation.strategies import (
     FeatureDistillation as FeatureDistillation,
diff --git a/keras/api/distillation/__init__.py b/keras/api/distillation/__init__.py
index 48dd038b3f3c..b1659fe83b6b 100644
--- a/keras/api/distillation/__init__.py
+++ b/keras/api/distillation/__init__.py
@@ -6,7 +6,7 @@
 
 from keras.src.distillation.distiller import Distiller as Distiller
 from keras.src.distillation.strategies import (
-    BaseDistillationStrategy as BaseDistillationStrategy,
+    DistillationLoss as DistillationLoss,
 )
 from keras.src.distillation.strategies import (
     FeatureDistillation as FeatureDistillation,
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index cefec92489e3..e8d5b3d906d1 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -1,5 +1,5 @@
 import keras
-from keras.src import ops
+from keras.src import tree
 from keras.src.api_export import keras_export
 from keras.src.models.model import Model
 from keras.src.saving import serialization_lib
@@ -59,7 +59,7 @@ class Distiller(Model):
       its corresponding output.
 
     - Custom Strategies: Create custom strategies by subclassing
-      `BaseDistillationStrategy` and overriding the `compute_loss` method.
+      `DistillationLoss` and overriding the `compute_loss` method.
 
     Args:
         teacher: A trained `keras.Model` that serves as the knowledge source.
@@ -94,7 +94,9 @@ class Distiller(Model):
     import keras_hub as hub
 
     teacher = hub.models.CausalLM.from_preset("gemma3_4b_en")
-    student = hub.models.CausalLM.from_preset("gemma2_2b_en")
+    student = hub.models.CausalLM.from_preset(
+        "gemma2_2b_en", load_weights=False
+    )
 
     # Create distillation strategy
     strategy = LogitsDistillation(temperature=3.0)
@@ -153,10 +155,6 @@ def __init__(
         name="distiller",
         **kwargs,
     ):
-        # Extract input_mapping and output_mapping before super().__init__
-        self.input_mapping = kwargs.pop("input_mapping", None)
-        self.output_mapping = kwargs.pop("output_mapping", None)
-
         super().__init__(name=name, **kwargs)
 
         # Validate inputs
@@ -185,41 +183,28 @@ def __init__(
                 f"metrics must be a list or tuple, got {type(metrics)}"
             )
 
-        # Convert string loss to function if needed
-        if isinstance(student_loss, str):
-            self._student_loss = keras.losses.get(student_loss)
-            if self._student_loss is None:
+            # Convert string loss to function using tree.map_structure
+
+        def convert_loss_to_function(loss):
+            if isinstance(loss, str):
+                loss_fn = keras.losses.get(loss)
+                if loss_fn is None:
+                    raise ValueError(
+                        f"Unknown loss function: '{loss}'. "
+                        "Please provide a valid loss function name or instance."
+                    )
+                return loss_fn
+            elif loss is None:
                 raise ValueError(
-                    f"Unknown loss function: '{student_loss}'. "
-                    "Please provide a valid loss function name or instance."
+                    "Student loss function cannot be None. "
+                    "Please provide a valid 'student_loss' parameter."
                 )
-        elif isinstance(student_loss, list):
-            # Handle multi-output loss functions
-            self._student_loss = []
-            for i, loss in enumerate(student_loss):
-                if isinstance(loss, str):
-                    loss_fn = keras.losses.get(loss)
-                    if loss_fn is None:
-                        raise ValueError(
-                            f"Unknown loss function at index {i}: '{loss}'. "
-                            "Please provide valid loss function names or "
-                            "instances."
-                        )
-                    self._student_loss.append(loss_fn)
-                else:
-                    self._student_loss.append(loss)
-        else:
-            self._student_loss = student_loss
-
-        # Validate that we have a valid loss function
-        if self._student_loss is None:
-            raise ValueError(
-                "Student loss function cannot be None. "
-                "Please provide a valid 'student_loss' parameter."
-            )
+            else:
+                return loss
 
-        # Validate architecture compatibility for feature distillation
-        self._validate_architecture_compatibility(teacher, student)
+        self._student_loss = tree.map_structure(
+            convert_loss_to_function, student_loss
+        )
 
         # Handle strategy configuration
         if strategy is not None and strategies is not None:
@@ -267,6 +252,9 @@ def __init__(
         for strategy in self.strategies:
             self._validate_strategy_compatibility(teacher, student, strategy)
 
+        # Create efficient multi-layer feature extractors
+        self._create_multi_feature_extractors()
+
         # Freeze teacher model
         self.teacher.trainable = False
 
@@ -433,12 +421,6 @@ def _validate_dtype_compatibility(self, teacher, student):
                     f"Both models must use the same data type."
                 )
 
-    def _validate_architecture_compatibility(self, teacher, student):
-        """Validate architecture compatibility for feature distillation."""
-        # This validation is strategy-specific and will be called by strategies
-        # that require specific architectural compatibility
-        pass
-
     def _validate_strategy_compatibility(self, teacher, student, strategy):
         """Validate that the strategy is compatible with the teacher and student
         models."""
@@ -473,6 +455,195 @@ def _shapes_are_compatible(self, shape1, shape2):
                 return False
         return True
 
+    def _create_multi_feature_extractors(self):
+        """Create efficient feature extractors that extract all needed features
+        in single forward passes.
+
+        This method analyzes all FeatureDistillation strategies to determine
+        which layers need feature extraction, then creates models that extract
+        all required features in one pass to avoid redundant computation.
+        """
+        # Collect all layer names needed for feature extraction
+        teacher_layer_names = []
+        student_layer_names = []
+
+        for strategy in self.strategies:
+            if (
+                hasattr(strategy, "teacher_layer_name")
+                and strategy.teacher_layer_name
+            ):
+                if strategy.teacher_layer_name not in teacher_layer_names:
+                    teacher_layer_names.append(strategy.teacher_layer_name)
+            if (
+                hasattr(strategy, "student_layer_name")
+                and strategy.student_layer_name
+            ):
+                if strategy.student_layer_name not in student_layer_names:
+                    student_layer_names.append(strategy.student_layer_name)
+
+        # Create multi-output feature extractors if needed
+        self._teacher_feature_extractor = None
+        self._student_feature_extractor = None
+        self._teacher_layer_outputs = {}
+        self._student_layer_outputs = {}
+
+        if teacher_layer_names:
+            try:
+                # For Sequential models, use the last layer's output as final
+                if isinstance(self.teacher, keras.Sequential):
+                    final_output = self.teacher.layers[-1].output
+                    inputs = self.teacher.layers[0].input
+                else:
+                    # For Functional models
+                    if (
+                        not hasattr(self.teacher, "inputs")
+                        or self.teacher.inputs is None
+                    ):
+                        raise ValueError("Teacher model has no defined inputs")
+                    if (
+                        not hasattr(self.teacher, "output")
+                        or self.teacher.output is None
+                    ):
+                        raise ValueError("Teacher model has no defined output")
+                    final_output = self.teacher.output
+                    inputs = self.teacher.inputs
+
+                teacher_outputs = [final_output]  # Always include final output
+                teacher_output_names = ["final_output"]
+
+                for layer_name in teacher_layer_names:
+                    layer = self.teacher.get_layer(name=layer_name)
+                    teacher_outputs.append(layer.output)
+                    teacher_output_names.append(layer_name)
+
+                self._teacher_feature_extractor = keras.Model(
+                    inputs=inputs,
+                    outputs=teacher_outputs,
+                    name=f"{self.teacher.name}_multi_feature_extractor",
+                )
+                self._teacher_output_names = teacher_output_names
+            except (ValueError, AttributeError):
+                # Fallback to individual extraction for subclassed models
+                self._teacher_feature_extractor = None
+
+        if student_layer_names:
+            try:
+                # For Sequential models, use the last layer's output as final
+                if isinstance(self.student, keras.Sequential):
+                    final_output = self.student.layers[-1].output
+                    inputs = self.student.layers[0].input
+                else:
+                    # For Functional models
+                    if (
+                        not hasattr(self.student, "inputs")
+                        or self.student.inputs is None
+                    ):
+                        raise ValueError("Student model has no defined inputs")
+                    if (
+                        not hasattr(self.student, "output")
+                        or self.student.output is None
+                    ):
+                        raise ValueError("Student model has no defined output")
+                    final_output = self.student.output
+                    inputs = self.student.inputs
+
+                student_outputs = [final_output]  # Always include final output
+                student_output_names = ["final_output"]
+
+                for layer_name in student_layer_names:
+                    layer = self.student.get_layer(name=layer_name)
+                    student_outputs.append(layer.output)
+                    student_output_names.append(layer_name)
+
+                self._student_feature_extractor = keras.Model(
+                    inputs=inputs,
+                    outputs=student_outputs,
+                    name=f"{self.student.name}_multi_feature_extractor",
+                )
+                self._student_output_names = student_output_names
+            except (ValueError, AttributeError):
+                # Fallback to individual extraction for subclassed models
+                self._student_feature_extractor = None
+
+    def _extract_all_teacher_features(self, x):
+        """Extract all teacher features efficiently in a single forward pass.
+
+        Args:
+            x: Input data.
+
+        Returns:
+            Dict mapping layer names to their outputs, including 'final_output'.
+        """
+        if self._teacher_feature_extractor is not None:
+            # Use efficient multi-output extractor
+            feature_outputs = self._teacher_feature_extractor(x, training=False)
+            if not isinstance(feature_outputs, (list, tuple)):
+                feature_outputs = [feature_outputs]
+
+            # Map outputs to layer names
+            features = {}
+            for name, output in zip(
+                self._teacher_output_names, feature_outputs
+            ):
+                features[name] = output
+            return features
+        else:
+            # Fallback: just get final output for LogitsDistillation
+            return {"final_output": self.teacher(x, training=False)}
+
+    def _extract_all_student_features(self, x, y_pred):
+        """Extract all student features efficiently in a single forward pass.
+
+        Args:
+            x: Input data.
+            y_pred: Student predictions from forward pass (to avoid
+                recomputation).
+
+        Returns:
+            Dict mapping layer names to their outputs, including 'final_output'.
+        """
+        if self._student_feature_extractor is not None:
+            # Use efficient multi-output extractor
+            feature_outputs = self._student_feature_extractor(x, training=True)
+            if not isinstance(feature_outputs, (list, tuple)):
+                feature_outputs = [feature_outputs]
+
+            # Map outputs to layer names
+            features = {}
+            for name, output in zip(
+                self._student_output_names, feature_outputs
+            ):
+                features[name] = output
+            return features
+        else:
+            # Fallback: use y_pred for final output to avoid recomputation
+            return {"final_output": y_pred}
+
+    def _get_strategy_features(self, strategy, all_features, is_teacher):
+        """Get the specific features needed by a strategy from pre-extracted
+        features.
+
+        Args:
+            strategy: The FeatureDistillation strategy.
+            all_features: Dict of all extracted features.
+            is_teacher: Whether these are teacher features.
+
+        Returns:
+            The specific features needed by this strategy.
+        """
+        if is_teacher:
+            layer_name = strategy.teacher_layer_name or "final_output"
+        else:
+            layer_name = strategy.student_layer_name or "final_output"
+
+        if layer_name not in all_features:
+            raise ValueError(
+                f"Layer '{layer_name}' features not found in extracted "
+                f"features. Available features: {list(all_features.keys())}"
+            )
+
+        return all_features[layer_name]
+
     @property
     def student_model(self):
         """The trained student model for independent use.
@@ -543,7 +714,9 @@ def compute_loss(self, x=None, y=None, y_pred=None,
 
                     # Custom distillation loss computation
                     teacher_outputs = self.teacher(x, training=False)
-                    student_outputs = self.student(x, training=training)
+                    # Use y_pred (student output from forward pass) instead of
+                    # recomputing
+                    student_outputs = y_pred
 
                     # Custom loss logic here
                     distillation_loss = self._custom_distillation_loss(
@@ -558,115 +731,85 @@ def compute_loss(self, x=None, y=None, y_pred=None,
                 def _custom_distillation_loss(self, teacher_outputs,
                                             student_outputs):
                     # Implement custom distillation loss logic
-                    from keras import ops
-                    return ops.mean(
-                        ops.square(teacher_outputs - student_outputs)
+                    return keras.ops.mean(
+                        keras.ops.square(teacher_outputs - student_outputs)
                     )
             ```
         """
-        # Normalize y_pred and y to lists for consistent handling
-        if not isinstance(y_pred, (list, tuple)):
-            y_pred = [y_pred]
-        if y is not None and not isinstance(y, (list, tuple)):
-            y = [y]
-
-        # Compute student loss
+        # Compute student loss using tree operations for dicts, manual for lists
         student_loss = 0.0
         if self.student_loss_weight > 0.0 and y is not None:
-            # Use the configured loss function
-            if (
-                hasattr(self, "_student_loss")
-                and self._student_loss is not None
-            ):
-                if isinstance(self._student_loss, list):
-                    # Multi-output loss
-                    if isinstance(y_pred, list) and len(y_pred) > 0:
-                        # Validate lengths match
-                        if len(y) != len(y_pred):
-                            raise ValueError(
-                                f"Number of targets ({len(y)}) must match "
-                                f"number of predictions ({len(y_pred)}) for "
-                                f"multi-output loss computation."
-                            )
-                        if len(self._student_loss) != len(y):
-                            raise ValueError(
-                                f"Number of loss functions "
-                                f"({len(self._student_loss)}) must match "
-                                f"number of outputs ({len(y)}) for "
-                                f"multi-output loss computation."
-                            )
-
-                        # Compute loss for each output
-                        student_loss = sum(
-                            loss_fn(y[i], y_pred[i])
-                            for i, loss_fn in enumerate(self._student_loss)
-                        )
-                    else:
-                        # Single output with multi-output loss list
-                        if len(self._student_loss) != 1:
-                            raise ValueError(
-                                f"Single output provided but "
-                                f"{len(self._student_loss)} loss functions "
-                                f"configured. Use a single loss function or "
-                                f"provide multiple outputs."
-                            )
-                        student_loss = self._student_loss[0](y[0], y_pred[0])
-                else:
-                    # Single loss function
-                    if isinstance(y_pred, list) and len(y_pred) > 0:
-                        # Multi-output with single loss function
-                        if len(y) != len(y_pred):
-                            raise ValueError(
-                                f"Number of targets ({len(y)}) must match "
-                                f"number of predictions ({len(y_pred)}) for "
-                                f"multi-output loss computation."
-                            )
-                        # Use first output for student loss (consistent
-                        # behavior)
-                        student_loss = self._student_loss(y[0], y_pred[0])
-                    else:
-                        # Single output with single loss function
-                        student_loss = self._student_loss(y, y_pred)
+            if isinstance(self._student_loss, dict):
+                # Dict case - check keys match at runtime (keys can change)
+                loss_keys = set(self._student_loss.keys())
+                y_keys = set(y.keys())
+                pred_keys = set(y_pred.keys())
+                if loss_keys != y_keys or y_keys != pred_keys:
+                    raise ValueError(
+                        f"Keys must match across loss functions, targets, and "
+                        f"predictions. Loss keys: {loss_keys}, "
+                        f"Target keys: {y_keys}, Prediction keys: {pred_keys}"
+                    )
+
+                # Compute losses manually and sum using tree.flatten
+                loss_values = {
+                    key: self._student_loss[key](y[key], y_pred[key])
+                    for key in self._student_loss.keys()
+                }
+                flat_losses = tree.flatten(loss_values)
+                student_loss = keras.ops.sum(keras.ops.stack(flat_losses))
+            elif isinstance(self._student_loss, (list, tuple)):
+                # List/tuple case - check lengths match at runtime (can change)
+                if len(y) != len(y_pred) or len(self._student_loss) != len(y):
+                    raise ValueError(
+                        f"Number of targets ({len(y)}), predictions "
+                        f"({len(y_pred)}), and loss functions "
+                        f"({len(self._student_loss)}) must match."
+                    )
+
+                # Compute losses manually and sum using tree.flatten
+                loss_values = [
+                    loss_fn(y_true, y_pred_i)
+                    for loss_fn, y_true, y_pred_i in zip(
+                        self._student_loss, y, y_pred
+                    )
+                ]
+                flat_losses = tree.flatten(loss_values)
+                student_loss = keras.ops.sum(keras.ops.stack(flat_losses))
             else:
-                # No loss function configured - this is an error
-                raise ValueError(
-                    "Student loss function is not configured. "
-                    "Please provide a valid 'student_loss' parameter to the "
-                    "Distiller constructor. "
-                    "Examples: 'sparse_categorical_crossentropy', "
-                    "'categorical_crossentropy', or a custom loss function."
-                )
+                # Single output case
+                student_loss = self._student_loss(y, y_pred)
 
             # Ensure student_loss is a scalar
             if hasattr(student_loss, "shape") and len(student_loss.shape) > 0:
-                student_loss = ops.mean(student_loss)
+                student_loss = keras.ops.mean(student_loss)
 
         # Compute distillation loss
         distillation_loss = 0.0
         if self.student_loss_weight < 1.0:
-            # Get teacher outputs
-            teacher_outputs = self.teacher(x, training=False)
-
-            # Apply strategies
-            for i, (strategy, weight) in enumerate(
-                zip(self.strategies, self.strategy_weights)
-            ):
-                # Get the corresponding output for this strategy
-                if isinstance(y_pred, (list, tuple)) and i < len(y_pred):
-                    strategy_output = y_pred[i]
-                else:
-                    strategy_output = y_pred
-
-                if isinstance(teacher_outputs, (list, tuple)) and i < len(
-                    teacher_outputs
-                ):
-                    strategy_teacher_output = teacher_outputs[i]
+            # Extract all features efficiently in single forward passes
+            teacher_features = self._extract_all_teacher_features(x)
+            student_features = self._extract_all_student_features(x, y_pred)
+
+            # Apply strategies using pre-extracted features
+            for strategy, weight in zip(self.strategies, self.strategy_weights):
+                # Get appropriate outputs/features for this strategy
+                if hasattr(strategy, "teacher_layer_name"):
+                    # FeatureDistillation - use extracted features
+                    strategy_teacher_output = self._get_strategy_features(
+                        strategy, teacher_features, is_teacher=True
+                    )
+                    strategy_student_output = self._get_strategy_features(
+                        strategy, student_features, is_teacher=False
+                    )
                 else:
-                    strategy_teacher_output = teacher_outputs
+                    # LogitsDistillation - use final model outputs
+                    strategy_teacher_output = teacher_features["final_output"]
+                    strategy_student_output = y_pred
 
                 # Compute loss for this strategy
                 strategy_loss = strategy.compute_loss(
-                    strategy_teacher_output, strategy_output
+                    strategy_teacher_output, strategy_student_output
                 )
 
                 # Apply weight and add to total
@@ -677,7 +820,7 @@ def _custom_distillation_loss(self, teacher_outputs,
                 hasattr(distillation_loss, "shape")
                 and len(distillation_loss.shape) > 0
             ):
-                distillation_loss = ops.mean(distillation_loss)
+                distillation_loss = keras.ops.mean(distillation_loss)
 
         # Combine losses
         total_loss = (
@@ -726,8 +869,6 @@ def get_config(self):
                 ],
                 "strategy_weights": self.strategy_weights,
                 "student_loss_weight": self.student_loss_weight,
-                "input_mapping": self.input_mapping,
-                "output_mapping": self.output_mapping,
             }
         )
         return config
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 28219d6b1469..da72a8ddb604 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -155,10 +155,10 @@ def test_multi_strategy_functionality(self):
         """Test multi-strategy functionality."""
         # Create multiple strategies
         strategies = [
-            LogitsDistillation(temperature=3.0, output_index=0),
-            LogitsDistillation(temperature=2.0, output_index=0),
+            LogitsDistillation(temperature=3.0),
+            LogitsDistillation(temperature=2.0),
         ]
-        strategy_weights = [1.0, 0.5]
+        strategy_weights = [0.7, 0.3]
 
         # Create distiller with multiple strategies
         distiller = Distiller(
@@ -171,48 +171,49 @@ def test_multi_strategy_functionality(self):
             student_loss="sparse_categorical_crossentropy",
         )
 
-        # Check that strategies are stored correctly
+        # Test that strategies are stored correctly
         self.assertEqual(len(distiller.strategies), 2)
-        self.assertEqual(distiller.strategy_weights, [1.0, 0.5])
-        self.assertFalse(distiller.single_strategy)
+        self.assertEqual(distiller.strategy_weights, [0.7, 0.3])
 
-        # Test that both strategies have correct temperatures
-        self.assertEqual(distiller.strategies[0].temperature, 3.0)
-        self.assertEqual(distiller.strategies[1].temperature, 2.0)
+        # Test training
+        x = np.random.random((10, 8)).astype(np.float32)
+        y = np.random.randint(0, 10, (10,))
+        history = distiller.fit(x, y, epochs=1, verbose=0)
+
+        # Check metrics
+        self.assertIn("total_loss", history.history)
+        self.assertIn("student_loss", history.history)
+        self.assertIn("distillation_loss", history.history)
 
     def test_multi_strategy_validation(self):
         """Test multi-strategy validation."""
         strategies = [
-            LogitsDistillation(temperature=3.0, output_index=0),
-            LogitsDistillation(temperature=2.0, output_index=0),
+            LogitsDistillation(temperature=3.0),
+            LogitsDistillation(temperature=2.0),
         ]
 
-        # Test with mismatched weights
-        with self.assertRaises(ValueError):
-            Distiller(
-                teacher=self.teacher,
-                student=self.student,
-                strategies=strategies,
-                strategy_weights=[1.0],  # Wrong length
-                student_loss_weight=0.5,
-            )
+        # Test that validation passes for valid configurations
+        distiller = Distiller(
+            teacher=self.teacher,
+            student=self.student,
+            strategies=strategies,
+            student_loss_weight=0.5,
+            optimizer="adam",
+            student_loss="sparse_categorical_crossentropy",
+        )
 
-        # Test with both strategy and strategies
-        with self.assertRaises(ValueError):
-            Distiller(
-                teacher=self.teacher,
-                student=self.student,
-                strategy=self.strategy,
-                strategies=strategies,
-                student_loss_weight=0.5,
-            )
+        self.assertEqual(len(distiller.strategies), 2)
 
-        # Test with neither strategy nor strategies
+        # Test invalid strategy weights length
         with self.assertRaises(ValueError):
             Distiller(
                 teacher=self.teacher,
                 student=self.student,
+                strategies=strategies,
+                strategy_weights=[1.0],  # Wrong length
                 student_loss_weight=0.5,
+                optimizer="adam",
+                student_loss="sparse_categorical_crossentropy",
             )
 
     def test_student_loss_weighting(self):
@@ -384,24 +385,6 @@ def test_prediction_workflow(self):
         prediction_sums = np.sum(predictions, axis=1)
         self.assertTrue(np.all(np.isfinite(prediction_sums)))
 
-    def test_get_student_model_method(self):
-        """Test the get_student_model() convenience method."""
-        distiller = Distiller(
-            teacher=self.teacher,
-            student=self.student,
-            strategy=self.strategy,
-            student_loss_weight=0.5,
-            optimizer=keras.optimizers.Adam(),
-            student_loss="sparse_categorical_crossentropy",
-        )
-
-        # Test that student_model property returns the same as direct access
-        student_direct = distiller.student
-        student_property = distiller.student_model
-
-        self.assertIs(student_direct, student_property)
-        self.assertEqual(student_property.name, self.student.name)
-
     def test_distiller_serialization_and_saving(self):
         """Test Distiller serialization, saving, and loading."""
 
@@ -431,9 +414,7 @@ def test_distiller_serialization_and_saving(self):
         )
 
         # Create distiller with single strategy
-        strategy = LogitsDistillation(
-            temperature=3.0, loss_type="kl_divergence"
-        )
+        strategy = LogitsDistillation(temperature=3.0, loss="kl_divergence")
 
         original_distiller = Distiller(
             teacher=teacher,
@@ -458,8 +439,6 @@ def test_distiller_serialization_and_saving(self):
             "strategies",
             "strategy_weights",
             "student_loss_weight",
-            "input_mapping",
-            "output_mapping",
         ]
         for key in required_keys:
             self.assertIn(key, config, f"Missing key: {key}")
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/strategies.py
index 087408a7612b..814f3a87db4a 100644
--- a/keras/src/distillation/strategies.py
+++ b/keras/src/distillation/strategies.py
@@ -1,18 +1,18 @@
 import keras
-from keras.src import ops
+from keras.src import tree
 from keras.src.api_export import keras_export
 
 
-@keras_export("keras.distillation.BaseDistillationStrategy")
-class BaseDistillationStrategy:
-    """Base class for distillation strategies.
+@keras_export("keras.distillation.DistillationLoss")
+class DistillationLoss:
+    """Base class for distillation loss computation.
 
-    Distillation strategies define how to compute the distillation loss
-    between teacher and student outputs. Each strategy implements a specific
-    approach to knowledge transfer, from simple logits matching to multi-output
+    Distillation losses define how to compute the distillation loss
+    between teacher and student outputs. Each loss implements a specific
+    approach to knowledge transfer, from simple logits matching to feature-based
     distillation.
 
-    To create custom distillation strategies, subclass this class and
+    To create custom distillation losses, subclass this class and
     override the `compute_loss` method.
     """
 
@@ -62,7 +62,7 @@ def validate_outputs(self, teacher_outputs, student_outputs):
 
 
 @keras_export("keras.distillation.LogitsDistillation")
-class LogitsDistillation(BaseDistillationStrategy):
+class LogitsDistillation(DistillationLoss):
     """Distillation strategy that transfers knowledge from final model outputs.
 
     This strategy applies temperature scaling to the teacher's logits before
@@ -76,8 +76,7 @@ class LogitsDistillation(BaseDistillationStrategy):
        probability distributions that reveal relationships between classes.
 
     2. Loss Computation: The loss is computed between the temperature-scaled
-       teacher logits and student logits using either KL divergence or
-       categorical crossentropy.
+       teacher logits and student logits using the specified loss function.
 
     When to Use Logits Distillation:
 
@@ -98,71 +97,95 @@ class LogitsDistillation(BaseDistillationStrategy):
         temperature: Temperature for softmax scaling. Higher values produce
             softer probability distributions that are easier for the student to
             learn. Typical values range from 3-5. Defaults to 3.0.
-        loss_type: Type of loss function to use. Options:
-            - `"kl_divergence"`: KL divergence between teacher and student
-              distributions
-            - `"categorical_crossentropy"`: Crossentropy with teacher as target
-        output_index: Index of the output to use for multi-output models.
-            Defaults to 0.
+        loss: Loss function(s) to use for distillation. Can be:
+            - String identifier (e.g., 'kl_divergence',
+              'categorical_crossentropy')
+            - Keras loss instance
+            - List/tuple of losses for multi-output models
+            - Dict of losses for named outputs
+            The structure must match the model's output structure.
+            Defaults to 'kl_divergence'.
 
     Example:
 
     ```python
-    from keras import ops
-    # Basic logits distillation
+    # Basic logits distillation with KL divergence
     strategy = LogitsDistillation(temperature=3.0)
 
     # With categorical crossentropy loss
     strategy = LogitsDistillation(
         temperature=4.0,
-        loss_type="categorical_crossentropy"
+        loss="categorical_crossentropy"
+    )
+
+    # With custom loss instance
+    strategy = LogitsDistillation(
+        temperature=4.0,
+        loss=keras.losses.CategoricalCrossentropy(from_logits=True)
+    )
+
+    # For multi-output models with list structure
+    strategy = LogitsDistillation(
+        temperature=3.0,
+        loss=["kl_divergence", "categorical_crossentropy"]
+    )
+
+    # For multi-output models with dict structure
+    strategy = LogitsDistillation(
+        temperature=3.0,
+        loss={
+            "classification": "kl_divergence",
+            "regression": "mse"
+        }
     )
 
     # Custom loss by subclassing
     class CustomLogitsDistillation(LogitsDistillation):
         def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            # Get the outputs to distill
-            teacher_logits = teacher_outputs[self.output_index]
-            student_logits = student_outputs[self.output_index]
-
-            # Apply temperature scaling
-            teacher_logits = teacher_logits / self.temperature
-            student_logits = student_logits / self.temperature
+            # Apply temperature scaling using tree.map_structure
+            teacher_scaled = tree.map_structure(
+                lambda x: x / self.temperature, teacher_outputs
+            )
+            student_scaled = tree.map_structure(
+                lambda x: x / self.temperature, student_outputs
+            )
 
             # Custom loss computation
-            teacher_probs = ops.softmax(teacher_logits, axis=-1)
-            student_probs = ops.softmax(student_logits, axis=-1)
-            return ops.mean(
-                keras.losses.kl_divergence(teacher_probs, student_probs)
+            return tree.map_structure(
+                lambda t, s: keras.ops.mean(
+                    keras.losses.kl_divergence(
+                        keras.ops.softmax(t, axis=-1),
+                        keras.ops.softmax(s, axis=-1)
+                    )
+                ),
+                teacher_scaled,
+                student_scaled
             )
-
-    strategy = CustomLogitsDistillation(temperature=3.0)
-
-    # For multi-output models
-    strategy = LogitsDistillation(
-        temperature=3.0,
-        output_index=1  # Use second output
-    )
     ```
     """
 
     def __init__(
         self,
         temperature=3.0,
-        loss_type="kl_divergence",
-        output_index=0,
+        loss="kl_divergence",
     ):
         super().__init__()
         self.temperature = temperature
-        self.loss_type = loss_type
-        self.output_index = output_index
 
-        # Validate loss_type
-        if loss_type not in ["kl_divergence", "categorical_crossentropy"]:
-            raise ValueError(
-                f"loss_type must be one of ['kl_divergence', "
-                f"'categorical_crossentropy'], got {loss_type}"
-            )
+        # Convert loss structure to functions using tree.map_structure
+        def convert_loss_to_function(loss_item):
+            if isinstance(loss_item, str):
+                loss_fn = keras.losses.get(loss_item)
+                if loss_fn is None:
+                    raise ValueError(
+                        f"Unknown loss function: '{loss_item}'. "
+                        "Please provide a valid loss function name or instance."
+                    )
+                return loss_fn
+            else:
+                return loss_item
+
+        self.loss = tree.map_structure(convert_loss_to_function, loss)
 
         # Validate temperature
         if not isinstance(self.temperature, (int, float)):
@@ -178,109 +201,81 @@ def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for logits distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
 
-        # Ensure outputs are lists/tuples
-        if not isinstance(teacher_outputs, (list, tuple)):
-            teacher_outputs = [teacher_outputs]
-        if not isinstance(student_outputs, (list, tuple)):
-            student_outputs = [student_outputs]
-
-        # Check output index is valid
-        if self.output_index >= len(teacher_outputs):
-            raise ValueError(
-                f"output_index {self.output_index} is out of range. "
-                f"Teacher has {len(teacher_outputs)} outputs."
-            )
-        if self.output_index >= len(student_outputs):
-            raise ValueError(
-                f"output_index {self.output_index} is out of range. "
-                f"Student has {len(student_outputs)} outputs."
-            )
-
-        # Check that the selected outputs have compatible shapes
-        teacher_output = teacher_outputs[self.output_index]
-        student_output = student_outputs[self.output_index]
-
-        if teacher_output.shape[-1] != student_output.shape[-1]:
+        # Validate that loss structure matches output structure
+        try:
+            tree.assert_same_structure(self.loss, teacher_outputs)
+            tree.assert_same_structure(self.loss, student_outputs)
+        except ValueError as e:
             raise ValueError(
-                f"Teacher and student outputs must have the same number of "
-                f"classes. "
-                f"Teacher output shape: {teacher_output.shape}, "
-                f"Student output shape: {student_output.shape}"
+                f"Loss structure must match output structure. "
+                f"Loss structure: {tree.structure(self.loss)}, "
+                f"Teacher output structure: {tree.structure(teacher_outputs)}, "
+                f"Student output structure: {tree.structure(student_outputs)}. "
+                f"Error: {e}"
             )
 
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-        """Compute distillation loss using Keras built-in loss functions.
+        """Compute distillation loss using the configured loss function.
 
         Args:
-            teacher_outputs: Logits from teacher model. Can be a single tensor
-                or a list/tuple of tensors for multi-output models.
-            student_outputs: Logits from student model. Can be a single tensor
-                or a list/tuple of tensors for multi-output models.
+            teacher_outputs: Logits from teacher model. Can be a single tensor,
+                list/tuple of tensors, or dict of tensors.
+            student_outputs: Logits from student model. Can be a single tensor,
+                list/tuple of tensors, or dict of tensors.
             **kwargs: Additional arguments (ignored).
         Returns:
             Distillation loss tensor.
         """
+        # Apply temperature scaling using tree.map_structure
+        teacher_scaled = tree.map_structure(
+            lambda x: x / self.temperature, teacher_outputs
+        )
+        student_scaled = tree.map_structure(
+            lambda x: x / self.temperature, student_outputs
+        )
+
+        # Apply loss function(s) to corresponding outputs
+        def apply_loss(loss_fn, teacher_logits, student_logits):
+            # Special handling for KL divergence (needs probabilities)
+            if (
+                hasattr(loss_fn, "__name__")
+                and "kl" in loss_fn.__name__.lower()
+            ):
+                teacher_probs = keras.ops.softmax(teacher_logits, axis=-1)
+                student_probs = keras.ops.softmax(student_logits, axis=-1)
+                loss = keras.ops.mean(loss_fn(teacher_probs, student_probs))
+                # Scale by temperature^2 for KL (per literature)
+                return loss * (self.temperature**2)
+            else:
+                # For other losses, use logits directly
+                return keras.ops.mean(loss_fn(teacher_logits, student_logits))
 
-        # Normalize outputs to lists
-        if not isinstance(teacher_outputs, (list, tuple)):
-            teacher_outputs = [teacher_outputs]
-        if not isinstance(student_outputs, (list, tuple)):
-            student_outputs = [student_outputs]
-
-        # Get the outputs to distill
-        teacher_logits = teacher_outputs[self.output_index]
-        student_logits = student_outputs[self.output_index]
-
-        # Apply temperature scaling
-        teacher_logits = teacher_logits / self.temperature
-        student_logits = student_logits / self.temperature
-
-        if self.loss_type == "kl_divergence":
-            # Convert to probabilities for KL divergence
-            teacher_probs = ops.softmax(teacher_logits, axis=-1)
-            student_probs = ops.softmax(student_logits, axis=-1)
-
-            # Use Keras KLDivergence directly and reduce to scalar
-            loss = ops.mean(
-                keras.losses.kl_divergence(teacher_probs, student_probs)
-            )
-
-            # Scale by temperature^2 for KL (per literature)
-            return loss * (self.temperature**2)
-
-        elif self.loss_type == "categorical_crossentropy":
-            # Convert teacher to probabilities, keep student as logits and
-            # pass from_logits=True for correct computation.
-            teacher_probs = ops.softmax(teacher_logits, axis=-1)
-
-            loss = ops.mean(
-                keras.losses.categorical_crossentropy(
-                    teacher_probs, student_logits, from_logits=True
-                )
-            )
-
-            # Do NOT scale by temperature^2 for categorical crossentropy
-            return loss
+        # Apply losses using tree.map_structure
+        loss_values = tree.map_structure(
+            apply_loss, self.loss, teacher_scaled, student_scaled
+        )
 
-        else:
-            raise ValueError(f"Unknown loss_type: {self.loss_type}")
+        # Sum all losses and return scalar
+        flat_losses = tree.flatten(loss_values)
+        return keras.ops.sum(keras.ops.stack(flat_losses))
 
     def get_config(self):
         """Get configuration for serialization."""
         return {
             "temperature": self.temperature,
-            "loss_type": self.loss_type,
-            "output_index": self.output_index,
+            "loss": keras.losses.serialize(self.loss),
         }
 
     @classmethod
     def from_config(cls, config):
         """Create instance from configuration."""
+        config = config.copy()
+        config["loss"] = keras.losses.deserialize(config["loss"])
         return cls(**config)
 
 
 @keras_export("keras.distillation.FeatureDistillation")
-class FeatureDistillation(BaseDistillationStrategy):
+class FeatureDistillation(DistillationLoss):
     """Feature distillation strategy using intermediate layer representations.
 
     Feature distillation transfers knowledge from intermediate layers of the
@@ -330,9 +325,13 @@ class FeatureDistillation(BaseDistillationStrategy):
       rather than magnitude.
 
     Args:
-        loss_type: Type of loss function to use. Options:
-            - `"mse"`: Mean squared error between teacher and student features
-            - `"cosine"`: Cosine similarity between feature vectors
+        loss: Loss function(s) to use for feature distillation. Can be:
+            - String identifier (e.g., 'mse', 'cosine_similarity', 'mae')
+            - Keras loss instance
+            - List/tuple of losses for multi-output models
+            - Dict of losses for named outputs
+            The structure must match the model's output structure.
+            Defaults to 'mse'.
         teacher_layer_name: Name of the teacher layer to extract features from.
             If None, uses the final output. Defaults to None.
         student_layer_name: Name of the student layer to extract features from.
@@ -342,17 +341,37 @@ class FeatureDistillation(BaseDistillationStrategy):
 
     ```python
     # Basic feature distillation from final outputs
-    strategy = FeatureDistillation(loss_type="mse")
+    strategy = FeatureDistillation(loss="mse")
+
+    # With custom loss instance
+    strategy = FeatureDistillation(
+        loss=keras.losses.MeanAbsoluteError()
+    )
+
+    # For multi-output models with list structure
+    strategy = FeatureDistillation(
+        loss=["mse", "cosine_similarity"]
+    )
+
+    # For multi-output models with dict structure
+    strategy = FeatureDistillation(
+        loss={
+            "features_1": "mse",
+            "features_2": "cosine_similarity"
+        }
+    )
 
     # Custom loss by subclassing
     class CustomFeatureDistillation(FeatureDistillation):
         def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            # Use first output by default
-            teacher_features = teacher_outputs[0]
-            student_features = student_outputs[0]
-
-            # Custom L1 loss for feature distillation
-            return ops.mean(ops.abs(teacher_features - student_features))
+            # Apply loss using tree.map_structure
+            return tree.map_structure(
+                lambda t, s: keras.ops.mean(
+                    keras.ops.abs(t - s)
+                ),
+                teacher_outputs,
+                student_outputs
+            )
 
     strategy = CustomFeatureDistillation(
         teacher_layer_name="dense_1",
@@ -361,21 +380,21 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
 
     # Distill from specific layers with compatible shapes
     strategy = FeatureDistillation(
-        loss_type="mse",
+        loss="mse",
         teacher_layer_name="dense_1",
         student_layer_name="dense_1"
     )
 
     # Use cosine similarity for different feature sizes
     strategy = FeatureDistillation(
-        loss_type="cosine",
+        loss="cosine_similarity",
         teacher_layer_name="conv2d_2",
         student_layer_name="conv2d_1"
     )
 
     # Distill from final outputs (equivalent to logits distillation)
     strategy = FeatureDistillation(
-        loss_type="mse",
+        loss="mse",
         teacher_layer_name=None,  # Final output
         student_layer_name=None   # Final output
     )
@@ -383,9 +402,8 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
     """
 
     def __init__(
-        self, loss_type="mse", teacher_layer_name=None, student_layer_name=None
+        self, loss="mse", teacher_layer_name=None, student_layer_name=None
     ):
-        self.loss_type = loss_type
         self.teacher_layer_name = teacher_layer_name
         self.student_layer_name = student_layer_name
 
@@ -393,68 +411,99 @@ def __init__(
         self._teacher_feature_model = None
         self._student_feature_model = None
 
-        # Validate loss_type
-        valid_loss_types = ["mse", "cosine"]
-        if loss_type not in valid_loss_types:
-            raise ValueError(f"loss_type must be one of {valid_loss_types}")
+        # Convert loss structure to functions using tree.map_structure
+        def convert_loss_to_function(loss_item):
+            if isinstance(loss_item, str):
+                loss_fn = keras.losses.get(loss_item)
+                if loss_fn is None:
+                    raise ValueError(
+                        f"Unknown loss function: '{loss_item}'. "
+                        "Please provide a valid loss function name or instance."
+                    )
+                return loss_fn
+            else:
+                return loss_item
 
-    def _get_teacher_features(self, teacher_model, inputs):
-        """Extract features from teacher model."""
+        self.loss = tree.map_structure(convert_loss_to_function, loss)
+
+    def _get_features(
+        self, model, inputs, training, layer_name, feature_model_attr
+    ):
+        """Extract features from model at specified layer.
+
+        Args:
+            model: The model to extract features from.
+            inputs: Input data.
+            training: Whether model is in training mode.
+            layer_name: Name of layer to extract from (None for final output).
+            feature_model_attr: Attribute name to cache feature extraction
+                model.
+
+        Returns:
+            Extracted features.
+        """
         # No specific layer, use the final model output
-        if self.teacher_layer_name is None:
-            return teacher_model(inputs, training=False)
+        if layer_name is None:
+            return model(inputs, training=training)
 
-        # For intermediate layer extraction, we need to create a custom function
-        # that extracts the output at the specified layer
-        if self._teacher_feature_model is None:
-            # Build the model first if needed (for Sequential models)
+        # For intermediate layer extraction, create feature extractor if needed
+        if getattr(self, feature_model_attr) is None:
             try:
-                self._teacher_feature_model = self._create_feature_extractor(
-                    teacher_model, self.teacher_layer_name
+                setattr(
+                    self,
+                    feature_model_attr,
+                    self._create_feature_extractor(model, layer_name),
                 )
             except ValueError as e:
                 if "no defined inputs" in str(e).lower():
-                    # Build the model by calling it with the inputs first
-                    _ = teacher_model(inputs, training=False)
+                    # Build the model by calling it with inputs first
+                    _ = model(inputs, training=training)
                     # Now try again
-                    self._teacher_feature_model = (
-                        self._create_feature_extractor(
-                            teacher_model, self.teacher_layer_name
-                        )
+                    setattr(
+                        self,
+                        feature_model_attr,
+                        self._create_feature_extractor(model, layer_name),
                     )
                 else:
                     raise
 
-        return self._teacher_feature_model(inputs, training=False)
+        return getattr(self, feature_model_attr)(inputs, training=training)
 
-    def _get_student_features(self, student_model, inputs):
+    def get_teacher_features(self, teacher_model, inputs):
+        """Extract features from teacher model."""
+        return self._get_features(
+            teacher_model,
+            inputs,
+            False,
+            self.teacher_layer_name,
+            "_teacher_feature_model",
+        )
+
+    def get_student_features(self, student_model, inputs):
         """Extract features from student model."""
-        # No specific layer, use the final model output
-        if self.student_layer_name is None:
-            return student_model(inputs, training=True)
+        return self._get_features(
+            student_model,
+            inputs,
+            True,
+            self.student_layer_name,
+            "_student_feature_model",
+        )
 
-        # For intermediate layer extraction, we need to create a custom function
-        # that extracts the output at the specified layer
-        if self._student_feature_model is None:
-            # Build the model first if needed (for Sequential models)
+    def validate_model_compatibility(self, teacher, student):
+        """Validate that teacher and student models are compatible for feature
+        distillation."""
+        # Check if specified layers exist in the models
+        if self.teacher_layer_name is not None:
             try:
-                self._student_feature_model = self._create_feature_extractor(
-                    student_model, self.student_layer_name
-                )
+                teacher.get_layer(name=self.teacher_layer_name)
             except ValueError as e:
-                if "no defined inputs" in str(e).lower():
-                    # Build the model by calling it with the inputs first
-                    _ = student_model(inputs, training=True)
-                    # Now try again
-                    self._student_feature_model = (
-                        self._create_feature_extractor(
-                            student_model, self.student_layer_name
-                        )
-                    )
-                else:
-                    raise
+                raise ValueError(f"In teacher model: {e}")
 
-        return self._student_feature_model(inputs, training=True)
+        if self.student_layer_name is not None:
+            try:
+                student.get_layer(name=self.student_layer_name)
+            except ValueError as e:
+                raise ValueError(f"In student model: {e}")
 
     def _create_feature_extractor(self, model, layer_name):
         """Create a feature extractor function for the specified layer.
@@ -471,20 +520,12 @@ def _create_feature_extractor(self, model, layer_name):
             # Return the original model if no layer specified
             return model
 
-        # Find the layer by name
-        target_layer = None
-        for layer in model.layers:
-            if layer.name == layer_name:
-                target_layer = layer
-                break
-
-        if target_layer is None:
+        # Get the layer using Keras built-in method
+        try:
+            target_layer = model.get_layer(name=layer_name)
+        except ValueError as e:
             raise ValueError(
-                f"Layer '{layer_name}' not found in model. "
-                f"This may happen with a subclassed model that cannot be "
-                f"traversed using the standard layer API. "
-                f"Available layers: "
-                f"{[layer.name for layer in model.layers]}"
+                f"Layer '{layer_name}' not found in model '{model.name}'. {e}"
             )
 
         # Create a new model that extracts features from the specified layer.
@@ -523,28 +564,27 @@ def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for feature distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
 
-        # Normalize outputs to lists
-        if not isinstance(teacher_outputs, (list, tuple)):
-            teacher_outputs = [teacher_outputs]
-        if not isinstance(student_outputs, (list, tuple)):
-            student_outputs = [student_outputs]
+        # Validate that loss structure matches output structure
+        try:
+            tree.assert_same_structure(self.loss, teacher_outputs)
+            tree.assert_same_structure(self.loss, student_outputs)
+        except ValueError as e:
+            raise ValueError(
+                f"Loss structure must match output structure. "
+                f"Loss structure: {tree.structure(self.loss)}, "
+                f"Teacher output structure: {tree.structure(teacher_outputs)}, "
+                f"Student output structure: {tree.structure(student_outputs)}. "
+                f"Error: {e}"
+            )
 
-        # For feature distillation, we need to validate layer compatibility
+        # For feature distillation, validate layer compatibility if specified
         if (
             self.teacher_layer_name is not None
             and self.student_layer_name is not None
         ):
             # Validate that the specified layers exist and are compatible
             self._validate_layer_compatibility(teacher_outputs, student_outputs)
-        else:
-            # If no specific layers are specified, validate final outputs
-            if len(teacher_outputs) != len(student_outputs):
-                raise ValueError(
-                    f"Teacher and student must have the same number of "
-                    f"outputs. "
-                    f"Teacher has {len(teacher_outputs)} outputs, "
-                    f"student has {len(student_outputs)} outputs."
-                )
+        # Note: Base class already validated output count compatibility
 
     def _validate_layer_compatibility(self, teacher_outputs, student_outputs):
         """Validate that the specified layers are compatible for feature
@@ -554,39 +594,6 @@ def _validate_layer_compatibility(self, teacher_outputs, student_outputs):
         # names
         pass
 
-    def validate_model_compatibility(self, teacher, student):
-        """Validate that teacher and student models are compatible for feature
-        distillation."""
-        # Check if specified layers exist in the models
-        if self.teacher_layer_name is not None:
-            if not self._layer_exists_in_model(
-                teacher, self.teacher_layer_name
-            ):
-                raise ValueError(
-                    f"Teacher layer '{self.teacher_layer_name}' not found in "
-                    f"teacher model. "
-                    f"Available layers: "
-                    f"{[layer.name for layer in teacher.layers]}"
-                )
-
-        if self.student_layer_name is not None:
-            if not self._layer_exists_in_model(
-                student, self.student_layer_name
-            ):
-                raise ValueError(
-                    f"Student layer '{self.student_layer_name}' not found in "
-                    f"student model. "
-                    f"Available layers: "
-                    f"{[layer.name for layer in student.layers]}"
-                )
-
-    def _layer_exists_in_model(self, model, layer_name):
-        """Check if a layer with the given name exists in the model."""
-        for layer in model.layers:
-            if layer.name == layer_name:
-                return True
-        return False
-
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute feature distillation loss using extracted features.
 
@@ -597,52 +604,44 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
 
         Args:
             teacher_outputs: Intermediate features from teacher model.
-                Can be a single tensor or a list/tuple of tensors.
+                Can be a single tensor, list/tuple of tensors, or dict of
+                tensors.
             student_outputs: Intermediate features from student model.
-                Can be a single tensor or a list/tuple of tensors.
+                Can be a single tensor, list/tuple of tensors, or dict of
+                tensors.
             **kwargs: Additional arguments (ignored).
         Returns:
             Feature distillation loss tensor.
         """
 
-        # Normalize outputs to lists
-        if not isinstance(teacher_outputs, (list, tuple)):
-            teacher_outputs = [teacher_outputs]
-        if not isinstance(student_outputs, (list, tuple)):
-            student_outputs = [student_outputs]
+        # Apply loss function(s) to corresponding features
+        def apply_loss(loss_fn, teacher_features, student_features):
+            loss = keras.ops.mean(loss_fn(teacher_features, student_features))
 
-        # Use first output by default (can be extended to use specific outputs)
-        teacher_features = teacher_outputs[0]
-        student_features = student_outputs[0]
-
-        if self.loss_type == "mse":
-            # Use Keras MeanSquaredError directly and reduce to scalar
-            loss = ops.mean(
-                keras.losses.mean_squared_error(
-                    teacher_features, student_features
-                )
-            )
+            # Special handling for cosine similarity (convert similarity to
+            # distance)
+            if (
+                hasattr(loss_fn, "__name__")
+                and "cosine" in loss_fn.__name__.lower()
+            ):
+                # Convert similarity to distance: distance = 1 - similarity
+                loss = 1.0 - loss
 
-        elif self.loss_type == "cosine":
-            # Use Keras CosineSimilarity directly (returns similarity, convert
-            # to distance)
-            similarity = ops.mean(
-                keras.losses.cosine_similarity(
-                    teacher_features, student_features
-                )
-            )
-            # Convert similarity to distance: distance = 1 - similarity
-            loss = 1.0 - similarity
+            return loss
 
-        else:
-            raise ValueError(f"Unknown loss_type: {self.loss_type}")
+        # Apply losses using tree.map_structure
+        loss_values = tree.map_structure(
+            apply_loss, self.loss, teacher_outputs, student_outputs
+        )
 
-        return loss
+        # Sum all losses and return scalar
+        flat_losses = tree.flatten(loss_values)
+        return keras.ops.sum(keras.ops.stack(flat_losses))
 
     def get_config(self):
         """Get configuration for serialization."""
         return {
-            "loss_type": self.loss_type,
+            "loss": keras.losses.serialize(self.loss),
             "teacher_layer_name": self.teacher_layer_name,
             "student_layer_name": self.student_layer_name,
         }
@@ -650,4 +649,6 @@ def get_config(self):
     @classmethod
     def from_config(cls, config):
         """Create instance from configuration."""
+        config = config.copy()
+        config["loss"] = keras.losses.deserialize(config["loss"])
         return cls(**config)
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/strategies_test.py
index ef01d3bb6a64..08c2390b9a6c 100644
--- a/keras/src/distillation/strategies_test.py
+++ b/keras/src/distillation/strategies_test.py
@@ -2,7 +2,6 @@
 import pytest
 
 import keras
-from keras import ops
 from keras.src.distillation.distiller import Distiller
 from keras.src.distillation.strategies import FeatureDistillation
 from keras.src.distillation.strategies import LogitsDistillation
@@ -50,10 +49,10 @@ def test_logits_distillation_end_to_end(self):
         strategy = LogitsDistillation(temperature=2.0)
 
         # Create dummy logits with sufficient difference to ensure non-zero loss
-        teacher_logits = ops.convert_to_tensor(
+        teacher_logits = keras.ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
         )
-        student_logits = ops.convert_to_tensor(
+        student_logits = keras.ops.convert_to_tensor(
             np.array([[2.0, 1.0, 4.0], [3.0, 6.0, 2.0]]), dtype="float32"
         )
 
@@ -64,32 +63,30 @@ def test_logits_distillation_end_to_end(self):
         self.assertEqual(len(loss.shape), 0)
 
         # Check that loss is finite and positive
-        self.assertTrue(ops.isfinite(loss))
+        self.assertTrue(keras.ops.isfinite(loss))
         self.assertGreater(loss, 0.0)
 
     def test_logits_distillation_with_different_loss_types(self):
         """Test logits distillation with different loss types."""
         # Test KL divergence
-        strategy_kl = LogitsDistillation(
-            temperature=2.0, loss_type="kl_divergence"
-        )
-        teacher_logits = ops.convert_to_tensor(
+        strategy_kl = LogitsDistillation(temperature=2.0, loss="kl_divergence")
+        teacher_logits = keras.ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0]]), dtype="float32"
         )
-        student_logits = ops.convert_to_tensor(
+        student_logits = keras.ops.convert_to_tensor(
             np.array([[2.0, 1.0, 4.0]]), dtype="float32"
         )
 
         loss_kl = strategy_kl.compute_loss(teacher_logits, student_logits)
-        self.assertTrue(ops.isfinite(loss_kl))
+        self.assertTrue(keras.ops.isfinite(loss_kl))
         self.assertGreater(loss_kl, 0.0)
 
         # Test categorical crossentropy
         strategy_ce = LogitsDistillation(
-            temperature=2.0, loss_type="categorical_crossentropy"
+            temperature=2.0, loss="categorical_crossentropy"
         )
         loss_ce = strategy_ce.compute_loss(teacher_logits, student_logits)
-        self.assertTrue(ops.isfinite(loss_ce))
+        self.assertTrue(keras.ops.isfinite(loss_ce))
         self.assertGreater(loss_ce, 0.0)
 
 
@@ -131,30 +128,30 @@ def test_feature_distillation_end_to_end(self):
 
         # Test MSE loss
         strategy_mse = FeatureDistillation(
-            loss_type="mse",
+            loss="mse",
             teacher_layer_name="teacher_dense_1",
             student_layer_name="student_dense_1",
         )
 
-        teacher_features = ops.convert_to_tensor(
+        teacher_features = keras.ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
         )
-        student_features = ops.convert_to_tensor(
+        student_features = keras.ops.convert_to_tensor(
             np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"
         )
 
         loss_mse = strategy_mse.compute_loss(teacher_features, student_features)
         self.assertEqual(len(loss_mse.shape), 0)
-        self.assertTrue(ops.isfinite(loss_mse))
+        self.assertTrue(keras.ops.isfinite(loss_mse))
         self.assertGreater(loss_mse, 0.0)
 
         # Test cosine loss
-        strategy_cosine = FeatureDistillation(loss_type="cosine")
+        strategy_cosine = FeatureDistillation(loss="cosine_similarity")
         loss_cosine = strategy_cosine.compute_loss(
             teacher_features, student_features
         )
         self.assertEqual(len(loss_cosine.shape), 0)
-        self.assertTrue(ops.isfinite(loss_cosine))
+        self.assertTrue(keras.ops.isfinite(loss_cosine))
         self.assertGreaterEqual(loss_cosine, 0.0)
 
 
@@ -164,35 +161,37 @@ class TestMultiStrategyDistillation(TestCase):
 
     def test_multi_strategy_distillation_end_to_end(self):
         """Test multi-strategy distillation end-to-end."""
-        # Create strategies for different outputs
-        logits_strategy = LogitsDistillation(temperature=2.0, output_index=0)
-        feature_strategy = FeatureDistillation(loss_type="mse")
 
         # Create dummy multi-output data with very different values
         teacher_outputs = [
-            ops.convert_to_tensor(
+            keras.ops.convert_to_tensor(
                 np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
             ),
-            ops.convert_to_tensor(
+            keras.ops.convert_to_tensor(
                 np.array([[0.1, 0.2], [0.3, 0.4]]), dtype="float32"
             ),
         ]
         student_outputs = [
-            ops.convert_to_tensor(
+            keras.ops.convert_to_tensor(
                 np.array([[10.0, 20.0, 30.0], [40.0, 50.0, 60.0]]),
                 dtype="float32",
             ),
-            ops.convert_to_tensor(
+            keras.ops.convert_to_tensor(
                 np.array([[0.5, 0.6], [0.7, 0.8]]), dtype="float32"
             ),
         ]
 
-        # Test individual strategies
-        logits_loss = logits_strategy.compute_loss(
-            [teacher_outputs[0]], [student_outputs[0]]
+        # Test individual strategies with matching loss structures
+        logits_strategy_list = LogitsDistillation(
+            temperature=2.0, loss=["kl_divergence", "kl_divergence"]
+        )
+        feature_strategy_list = FeatureDistillation(loss=["mse", "mse"])
+
+        logits_loss = logits_strategy_list.compute_loss(
+            teacher_outputs, student_outputs
         )
-        feature_loss = feature_strategy.compute_loss(
-            [teacher_outputs[1]], [student_outputs[1]]
+        feature_loss = feature_strategy_list.compute_loss(
+            teacher_outputs, student_outputs
         )
 
         # Check that losses are scalar tensors
@@ -200,8 +199,8 @@ def test_multi_strategy_distillation_end_to_end(self):
         self.assertEqual(len(feature_loss.shape), 0)
 
         # Check that losses are finite and positive
-        self.assertTrue(ops.isfinite(logits_loss))
-        self.assertTrue(ops.isfinite(feature_loss))
+        self.assertTrue(keras.ops.isfinite(logits_loss))
+        self.assertTrue(keras.ops.isfinite(feature_loss))
         self.assertGreater(logits_loss, 0.0)
         self.assertGreater(feature_loss, 0.0)
 
@@ -218,8 +217,12 @@ def test_end_to_end_with_multi_output_models(self):
 
         # Create strategies list
         strategies = [
-            LogitsDistillation(temperature=2.0, output_index=0),
-            FeatureDistillation(loss_type="mse"),
+            LogitsDistillation(
+                temperature=2.0, loss=["kl_divergence", "kl_divergence"]
+            ),
+            FeatureDistillation(
+                loss=["mse", "mse"]
+            ),  # Match multi-output structure
         ]
         strategy_weights = [1.0, 0.5]
 

From 9c6a70cf84dbc27cca16cb9cb28fa91ebcad192c Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Tue, 2 Sep 2025 13:18:23 -0700
Subject: [PATCH 21/31] update file names

---
 keras/api/_tf_keras/keras/distillation/__init__.py        | 8 ++++----
 keras/api/distillation/__init__.py                        | 8 ++++----
 .../distillation/{strategies.py => distillation_loss.py}  | 0
 .../{strategies_test.py => distillation_loss_test.py}     | 0
 keras/src/distillation/distiller.py                       | 2 --
 5 files changed, 8 insertions(+), 10 deletions(-)
 rename keras/src/distillation/{strategies.py => distillation_loss.py} (100%)
 rename keras/src/distillation/{strategies_test.py => distillation_loss_test.py} (100%)

diff --git a/keras/api/_tf_keras/keras/distillation/__init__.py b/keras/api/_tf_keras/keras/distillation/__init__.py
index b1659fe83b6b..7f6fcd5bcc49 100644
--- a/keras/api/_tf_keras/keras/distillation/__init__.py
+++ b/keras/api/_tf_keras/keras/distillation/__init__.py
@@ -4,13 +4,13 @@
 since your modifications would be overwritten.
 """
 
-from keras.src.distillation.distiller import Distiller as Distiller
-from keras.src.distillation.strategies import (
+from keras.src.distillation.distillation_loss import (
     DistillationLoss as DistillationLoss,
 )
-from keras.src.distillation.strategies import (
+from keras.src.distillation.distillation_loss import (
     FeatureDistillation as FeatureDistillation,
 )
-from keras.src.distillation.strategies import (
+from keras.src.distillation.distillation_loss import (
     LogitsDistillation as LogitsDistillation,
 )
+from keras.src.distillation.distiller import Distiller as Distiller
diff --git a/keras/api/distillation/__init__.py b/keras/api/distillation/__init__.py
index b1659fe83b6b..7f6fcd5bcc49 100644
--- a/keras/api/distillation/__init__.py
+++ b/keras/api/distillation/__init__.py
@@ -4,13 +4,13 @@
 since your modifications would be overwritten.
 """
 
-from keras.src.distillation.distiller import Distiller as Distiller
-from keras.src.distillation.strategies import (
+from keras.src.distillation.distillation_loss import (
     DistillationLoss as DistillationLoss,
 )
-from keras.src.distillation.strategies import (
+from keras.src.distillation.distillation_loss import (
     FeatureDistillation as FeatureDistillation,
 )
-from keras.src.distillation.strategies import (
+from keras.src.distillation.distillation_loss import (
     LogitsDistillation as LogitsDistillation,
 )
+from keras.src.distillation.distiller import Distiller as Distiller
diff --git a/keras/src/distillation/strategies.py b/keras/src/distillation/distillation_loss.py
similarity index 100%
rename from keras/src/distillation/strategies.py
rename to keras/src/distillation/distillation_loss.py
diff --git a/keras/src/distillation/strategies_test.py b/keras/src/distillation/distillation_loss_test.py
similarity index 100%
rename from keras/src/distillation/strategies_test.py
rename to keras/src/distillation/distillation_loss_test.py
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index e8d5b3d906d1..dfe28a28db55 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -714,8 +714,6 @@ def compute_loss(self, x=None, y=None, y_pred=None,
 
                     # Custom distillation loss computation
                     teacher_outputs = self.teacher(x, training=False)
-                    # Use y_pred (student output from forward pass) instead of
-                    # recomputing
                     student_outputs = y_pred
 
                     # Custom loss logic here

From a7d0b54e8913745d7dc6a23b68848f30498e1b34 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Tue, 2 Sep 2025 18:01:28 -0700
Subject: [PATCH 22/31] subclass logits distillation loss from feature
 distillation loss

---
 keras/src/distillation/distillation_loss.py   | 428 ++++++++----------
 .../distillation/distillation_loss_test.py    | 367 ++++++++-------
 keras/src/distillation/distiller_test.py      |   2 +-
 3 files changed, 387 insertions(+), 410 deletions(-)

diff --git a/keras/src/distillation/distillation_loss.py b/keras/src/distillation/distillation_loss.py
index 814f3a87db4a..58d55c1f8034 100644
--- a/keras/src/distillation/distillation_loss.py
+++ b/keras/src/distillation/distillation_loss.py
@@ -61,219 +61,6 @@ def validate_outputs(self, teacher_outputs, student_outputs):
             )
 
 
-@keras_export("keras.distillation.LogitsDistillation")
-class LogitsDistillation(DistillationLoss):
-    """Distillation strategy that transfers knowledge from final model outputs.
-
-    This strategy applies temperature scaling to the teacher's logits before
-    computing the loss between teacher and student predictions. It's the most
-    common approach for knowledge distillation.
-
-    How Logits Distillation Works:
-
-    1. Temperature Scaling: The teacher's logits are divided by a `temperature`
-       parameter (typically 3-5) before applying softmax. This creates "softer"
-       probability distributions that reveal relationships between classes.
-
-    2. Loss Computation: The loss is computed between the temperature-scaled
-       teacher logits and student logits using the specified loss function.
-
-    When to Use Logits Distillation:
-
-    - General Classification: Works well for most classification tasks
-    - Model Compression: Effective for reducing model size while maintaining
-      accuracy
-    - Transfer Learning: Good for leveraging knowledge from pre-trained models
-    - Ensemble Distillation: Can combine multiple teacher models
-
-    Temperature Guidelines:
-
-    - Low Temperature (1-2): Sharp distributions, similar to hard labels
-    - Medium Temperature (3-5): Balanced softness, most commonly used
-    - High Temperature (6-10): Very soft distributions, reveals subtle
-      relationships
-
-    Args:
-        temperature: Temperature for softmax scaling. Higher values produce
-            softer probability distributions that are easier for the student to
-            learn. Typical values range from 3-5. Defaults to 3.0.
-        loss: Loss function(s) to use for distillation. Can be:
-            - String identifier (e.g., 'kl_divergence',
-              'categorical_crossentropy')
-            - Keras loss instance
-            - List/tuple of losses for multi-output models
-            - Dict of losses for named outputs
-            The structure must match the model's output structure.
-            Defaults to 'kl_divergence'.
-
-    Example:
-
-    ```python
-    # Basic logits distillation with KL divergence
-    strategy = LogitsDistillation(temperature=3.0)
-
-    # With categorical crossentropy loss
-    strategy = LogitsDistillation(
-        temperature=4.0,
-        loss="categorical_crossentropy"
-    )
-
-    # With custom loss instance
-    strategy = LogitsDistillation(
-        temperature=4.0,
-        loss=keras.losses.CategoricalCrossentropy(from_logits=True)
-    )
-
-    # For multi-output models with list structure
-    strategy = LogitsDistillation(
-        temperature=3.0,
-        loss=["kl_divergence", "categorical_crossentropy"]
-    )
-
-    # For multi-output models with dict structure
-    strategy = LogitsDistillation(
-        temperature=3.0,
-        loss={
-            "classification": "kl_divergence",
-            "regression": "mse"
-        }
-    )
-
-    # Custom loss by subclassing
-    class CustomLogitsDistillation(LogitsDistillation):
-        def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            # Apply temperature scaling using tree.map_structure
-            teacher_scaled = tree.map_structure(
-                lambda x: x / self.temperature, teacher_outputs
-            )
-            student_scaled = tree.map_structure(
-                lambda x: x / self.temperature, student_outputs
-            )
-
-            # Custom loss computation
-            return tree.map_structure(
-                lambda t, s: keras.ops.mean(
-                    keras.losses.kl_divergence(
-                        keras.ops.softmax(t, axis=-1),
-                        keras.ops.softmax(s, axis=-1)
-                    )
-                ),
-                teacher_scaled,
-                student_scaled
-            )
-    ```
-    """
-
-    def __init__(
-        self,
-        temperature=3.0,
-        loss="kl_divergence",
-    ):
-        super().__init__()
-        self.temperature = temperature
-
-        # Convert loss structure to functions using tree.map_structure
-        def convert_loss_to_function(loss_item):
-            if isinstance(loss_item, str):
-                loss_fn = keras.losses.get(loss_item)
-                if loss_fn is None:
-                    raise ValueError(
-                        f"Unknown loss function: '{loss_item}'. "
-                        "Please provide a valid loss function name or instance."
-                    )
-                return loss_fn
-            else:
-                return loss_item
-
-        self.loss = tree.map_structure(convert_loss_to_function, loss)
-
-        # Validate temperature
-        if not isinstance(self.temperature, (int, float)):
-            raise ValueError(
-                f"temperature must be a number, got {type(self.temperature)}"
-            )
-        if self.temperature <= 0.0:
-            raise ValueError(
-                "temperature must be > 0. Set a positive value (e.g., 1-10)."
-            )
-
-    def validate_outputs(self, teacher_outputs, student_outputs):
-        """Validate that outputs are compatible for logits distillation."""
-        super().validate_outputs(teacher_outputs, student_outputs)
-
-        # Validate that loss structure matches output structure
-        try:
-            tree.assert_same_structure(self.loss, teacher_outputs)
-            tree.assert_same_structure(self.loss, student_outputs)
-        except ValueError as e:
-            raise ValueError(
-                f"Loss structure must match output structure. "
-                f"Loss structure: {tree.structure(self.loss)}, "
-                f"Teacher output structure: {tree.structure(teacher_outputs)}, "
-                f"Student output structure: {tree.structure(student_outputs)}. "
-                f"Error: {e}"
-            )
-
-    def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-        """Compute distillation loss using the configured loss function.
-
-        Args:
-            teacher_outputs: Logits from teacher model. Can be a single tensor,
-                list/tuple of tensors, or dict of tensors.
-            student_outputs: Logits from student model. Can be a single tensor,
-                list/tuple of tensors, or dict of tensors.
-            **kwargs: Additional arguments (ignored).
-        Returns:
-            Distillation loss tensor.
-        """
-        # Apply temperature scaling using tree.map_structure
-        teacher_scaled = tree.map_structure(
-            lambda x: x / self.temperature, teacher_outputs
-        )
-        student_scaled = tree.map_structure(
-            lambda x: x / self.temperature, student_outputs
-        )
-
-        # Apply loss function(s) to corresponding outputs
-        def apply_loss(loss_fn, teacher_logits, student_logits):
-            # Special handling for KL divergence (needs probabilities)
-            if (
-                hasattr(loss_fn, "__name__")
-                and "kl" in loss_fn.__name__.lower()
-            ):
-                teacher_probs = keras.ops.softmax(teacher_logits, axis=-1)
-                student_probs = keras.ops.softmax(student_logits, axis=-1)
-                loss = keras.ops.mean(loss_fn(teacher_probs, student_probs))
-                # Scale by temperature^2 for KL (per literature)
-                return loss * (self.temperature**2)
-            else:
-                # For other losses, use logits directly
-                return keras.ops.mean(loss_fn(teacher_logits, student_logits))
-
-        # Apply losses using tree.map_structure
-        loss_values = tree.map_structure(
-            apply_loss, self.loss, teacher_scaled, student_scaled
-        )
-
-        # Sum all losses and return scalar
-        flat_losses = tree.flatten(loss_values)
-        return keras.ops.sum(keras.ops.stack(flat_losses))
-
-    def get_config(self):
-        """Get configuration for serialization."""
-        return {
-            "temperature": self.temperature,
-            "loss": keras.losses.serialize(self.loss),
-        }
-
-    @classmethod
-    def from_config(cls, config):
-        """Create instance from configuration."""
-        config = config.copy()
-        config["loss"] = keras.losses.deserialize(config["loss"])
-        return cls(**config)
-
-
 @keras_export("keras.distillation.FeatureDistillation")
 class FeatureDistillation(DistillationLoss):
     """Feature distillation strategy using intermediate layer representations.
@@ -469,26 +256,6 @@ def _get_features(
 
         return getattr(self, feature_model_attr)(inputs, training=training)
 
-    def get_teacher_features(self, teacher_model, inputs):
-        """Extract features from teacher model."""
-        return self._get_features(
-            teacher_model,
-            inputs,
-            False,
-            self.teacher_layer_name,
-            "_teacher_feature_model",
-        )
-
-    def get_student_features(self, student_model, inputs):
-        """Extract features from student model."""
-        return self._get_features(
-            student_model,
-            inputs,
-            True,
-            self.student_layer_name,
-            "_student_feature_model",
-        )
-
     def validate_model_compatibility(self, teacher, student):
         """Validate that teacher and student models are compatible for feature
         distillation."""
@@ -651,4 +418,199 @@ def from_config(cls, config):
         """Create instance from configuration."""
         config = config.copy()
         config["loss"] = keras.losses.deserialize(config["loss"])
+
+        # Filter out parameters that LogitsDistillation doesn't accept
+        # (inherited from FeatureDistillation's get_config)
+        config.pop("teacher_layer_name", None)
+        config.pop("student_layer_name", None)
+
+        return cls(**config)
+
+
+@keras_export("keras.distillation.LogitsDistillation")
+class LogitsDistillation(FeatureDistillation):
+    """Distillation strategy that transfers knowledge from final model outputs.
+
+    This strategy applies temperature scaling to the teacher's logits before
+    computing the loss between teacher and student predictions. It's the most
+    common approach for knowledge distillation.
+
+    How Logits Distillation Works:
+
+    1. Temperature Scaling: The teacher's logits are divided by a `temperature`
+       parameter (typically 3-5) before applying softmax. This creates "softer"
+       probability distributions that reveal relationships between classes.
+
+    2. Loss Computation: The loss is computed between the temperature-scaled
+       teacher logits and student logits using the specified loss function.
+
+    When to Use Logits Distillation:
+
+    - General Classification: Works well for most classification tasks
+    - Model Compression: Effective for reducing model size while maintaining
+      accuracy
+    - Transfer Learning: Good for leveraging knowledge from pre-trained models
+    - Ensemble Distillation: Can combine multiple teacher models
+
+    Temperature Guidelines:
+
+    - Low Temperature (1-2): Sharp distributions, similar to hard labels
+    - Medium Temperature (3-5): Balanced softness, most commonly used
+    - High Temperature (6-10): Very soft distributions, reveals subtle
+      relationships
+
+    Args:
+        temperature: Temperature for softmax scaling. Higher values produce
+            softer probability distributions that are easier for the student to
+            learn. Typical values range from 3-5. Defaults to 3.0.
+        loss: Loss function(s) to use for distillation. Can be:
+            - String identifier (e.g., 'kl_divergence',
+              'categorical_crossentropy')
+            - Keras loss instance
+            - List/tuple of losses for multi-output models
+            - Dict of losses for named outputs
+            The structure must match the model's output structure.
+            Defaults to 'kl_divergence'.
+
+    Example:
+
+    ```python
+    # Basic logits distillation with KL divergence
+    strategy = LogitsDistillation(temperature=3.0)
+
+    # With categorical crossentropy loss
+    strategy = LogitsDistillation(
+        temperature=4.0,
+        loss="categorical_crossentropy"
+    )
+
+    # With custom loss instance
+    strategy = LogitsDistillation(
+        temperature=4.0,
+        loss=keras.losses.CategoricalCrossentropy(from_logits=True)
+    )
+
+    # For multi-output models with list structure
+    strategy = LogitsDistillation(
+        temperature=3.0,
+        loss=["kl_divergence", "categorical_crossentropy"]
+    )
+
+    # For multi-output models with dict structure
+    strategy = LogitsDistillation(
+        temperature=3.0,
+        loss={
+            "classification": "kl_divergence",
+            "regression": "mse"
+        }
+    )
+
+    # Custom loss by subclassing
+    class CustomLogitsDistillation(LogitsDistillation):
+        def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
+            # Apply temperature scaling using tree.map_structure
+            teacher_scaled = tree.map_structure(
+                lambda x: x / self.temperature, teacher_outputs
+            )
+            student_scaled = tree.map_structure(
+                lambda x: x / self.temperature, student_outputs
+            )
+
+            # Custom loss computation
+            return tree.map_structure(
+                lambda t, s: keras.ops.mean(
+                    keras.losses.kl_divergence(
+                        keras.ops.softmax(t, axis=-1),
+                        keras.ops.softmax(s, axis=-1)
+                    )
+                ),
+                teacher_scaled,
+                student_scaled
+            )
+    ```
+    """
+
+    def __init__(
+        self,
+        temperature=3.0,
+        loss="kl_divergence",
+    ):
+        # Always use final outputs (no intermediate layers)
+        super().__init__(
+            loss=loss, teacher_layer_name=None, student_layer_name=None
+        )
+        self.temperature = temperature
+
+        # Validate temperature
+        if not isinstance(self.temperature, (int, float)):
+            raise ValueError(
+                f"temperature must be a number, got {type(self.temperature)}"
+            )
+        if self.temperature <= 0.0:
+            raise ValueError(
+                "temperature must be > 0. Set a positive value (e.g., 1-10)."
+            )
+
+    def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
+        """Compute distillation loss using the configured loss function.
+
+        Args:
+            teacher_outputs: Logits from teacher model. Can be a single tensor,
+                list/tuple of tensors, or dict of tensors.
+            student_outputs: Logits from student model. Can be a single tensor,
+                list/tuple of tensors, or dict of tensors.
+            **kwargs: Additional arguments (ignored).
+        Returns:
+            Distillation loss tensor.
+        """
+        # Apply temperature scaling using tree.map_structure
+        teacher_scaled = tree.map_structure(
+            lambda x: x / self.temperature, teacher_outputs
+        )
+        student_scaled = tree.map_structure(
+            lambda x: x / self.temperature, student_outputs
+        )
+
+        # Apply loss function(s) to corresponding outputs
+        def apply_loss(loss_fn, teacher_logits, student_logits):
+            # Special handling for KL divergence (needs probabilities)
+            if (
+                hasattr(loss_fn, "__name__")
+                and "kl" in loss_fn.__name__.lower()
+            ):
+                teacher_probs = keras.ops.softmax(teacher_logits, axis=-1)
+                student_probs = keras.ops.softmax(student_logits, axis=-1)
+                loss = keras.ops.mean(loss_fn(teacher_probs, student_probs))
+                # Scale by temperature^2 for KL (per literature)
+                return loss * (self.temperature**2)
+            else:
+                # For other losses, use logits directly
+                return keras.ops.mean(loss_fn(teacher_logits, student_logits))
+
+        # Apply losses using tree.map_structure
+        loss_values = tree.map_structure(
+            apply_loss, self.loss, teacher_scaled, student_scaled
+        )
+
+        # Sum all losses and return scalar
+        flat_losses = tree.flatten(loss_values)
+        return keras.ops.sum(keras.ops.stack(flat_losses))
+
+    def get_config(self):
+        """Get configuration for serialization."""
+        config = super().get_config()
+        config["temperature"] = self.temperature
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        """Create instance from configuration."""
+        config = config.copy()
+        config["loss"] = keras.losses.deserialize(config["loss"])
+
+        # Filter out parameters that LogitsDistillation doesn't accept
+        # (inherited from FeatureDistillation's get_config)
+        config.pop("teacher_layer_name", None)
+        config.pop("student_layer_name", None)
+
         return cls(**config)
diff --git a/keras/src/distillation/distillation_loss_test.py b/keras/src/distillation/distillation_loss_test.py
index 08c2390b9a6c..969034e53741 100644
--- a/keras/src/distillation/distillation_loss_test.py
+++ b/keras/src/distillation/distillation_loss_test.py
@@ -2,53 +2,21 @@
 import pytest
 
 import keras
+from keras.src.distillation.distillation_loss import FeatureDistillation
+from keras.src.distillation.distillation_loss import LogitsDistillation
 from keras.src.distillation.distiller import Distiller
-from keras.src.distillation.strategies import FeatureDistillation
-from keras.src.distillation.strategies import LogitsDistillation
 from keras.src.testing import TestCase
 
 
-class MultiOutputTeacher(keras.Model):
-    """Multi-output teacher model for testing."""
-
-    def __init__(self, vocab_size=10, hidden_dim=32):
-        super().__init__()
-        self.dense1 = keras.layers.Dense(hidden_dim, activation="relu")
-        self.dense2 = keras.layers.Dense(vocab_size)
-        self.dense3 = keras.layers.Dense(5)
-
-    def call(self, inputs, training=None):
-        x = self.dense1(inputs)
-        output1 = self.dense2(x)
-        output2 = self.dense3(x)
-        return [output1, output2]
-
-
-class MultiOutputStudent(keras.Model):
-    """Multi-output student model for testing."""
-
-    def __init__(self, vocab_size=10, hidden_dim=16):
-        super().__init__()
-        self.dense1 = keras.layers.Dense(hidden_dim, activation="relu")
-        self.dense2 = keras.layers.Dense(vocab_size)
-        self.dense3 = keras.layers.Dense(5)
-
-    def call(self, inputs, training=None):
-        x = self.dense1(inputs)
-        output1 = self.dense2(x)
-        output2 = self.dense3(x)
-        return [output1, output2]
-
-
 @pytest.mark.requires_trainable_backend
 class TestLogitsDistillation(TestCase):
-    """Essential test cases for LogitsDistillation strategy."""
+    """Test cases for LogitsDistillation strategy."""
 
-    def test_logits_distillation_end_to_end(self):
-        """Test logits distillation loss computation end-to-end."""
+    def test_logits_distillation_basic(self):
+        """Test basic logits distillation loss computation."""
         strategy = LogitsDistillation(temperature=2.0)
 
-        # Create dummy logits with sufficient difference to ensure non-zero loss
+        # Create dummy logits
         teacher_logits = keras.ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
         )
@@ -61,42 +29,42 @@ def test_logits_distillation_end_to_end(self):
 
         # Check that loss is a scalar tensor
         self.assertEqual(len(loss.shape), 0)
-
-        # Check that loss is finite and positive
         self.assertTrue(keras.ops.isfinite(loss))
         self.assertGreater(loss, 0.0)
 
-    def test_logits_distillation_with_different_loss_types(self):
-        """Test logits distillation with different loss types."""
-        # Test KL divergence
-        strategy_kl = LogitsDistillation(temperature=2.0, loss="kl_divergence")
-        teacher_logits = keras.ops.convert_to_tensor(
-            np.array([[1.0, 2.0, 3.0]]), dtype="float32"
+
+@pytest.mark.requires_trainable_backend
+class TestFeatureDistillation(TestCase):
+    """Test cases for FeatureDistillation strategy."""
+
+    def test_feature_distillation_basic(self):
+        """Test basic feature distillation loss computation."""
+        strategy = FeatureDistillation(loss="mse")
+
+        # Create dummy features
+        teacher_features = keras.ops.convert_to_tensor(
+            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
         )
-        student_logits = keras.ops.convert_to_tensor(
-            np.array([[2.0, 1.0, 4.0]]), dtype="float32"
+        student_features = keras.ops.convert_to_tensor(
+            np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"
         )
 
-        loss_kl = strategy_kl.compute_loss(teacher_logits, student_logits)
-        self.assertTrue(keras.ops.isfinite(loss_kl))
-        self.assertGreater(loss_kl, 0.0)
+        # Compute loss
+        loss = strategy.compute_loss(teacher_features, student_features)
 
-        # Test categorical crossentropy
-        strategy_ce = LogitsDistillation(
-            temperature=2.0, loss="categorical_crossentropy"
-        )
-        loss_ce = strategy_ce.compute_loss(teacher_logits, student_logits)
-        self.assertTrue(keras.ops.isfinite(loss_ce))
-        self.assertGreater(loss_ce, 0.0)
+        # Check that loss is a scalar tensor
+        self.assertEqual(len(loss.shape), 0)
+        self.assertTrue(keras.ops.isfinite(loss))
+        self.assertGreater(loss, 0.0)
 
 
 @pytest.mark.requires_trainable_backend
-class TestFeatureDistillation(TestCase):
-    """Essential test cases for FeatureDistillation strategy."""
+class TestEndToEndDistillation(TestCase):
+    """End-to-end distillation tests with real models."""
 
-    def test_feature_distillation_end_to_end(self):
-        """Test feature distillation end-to-end."""
-        # Create models with named layers for feature extraction
+    def test_logits_distillation_end_to_end(self):
+        """Test end-to-end logits distillation with real models."""
+        # Create teacher model (larger)
         teacher = keras.Sequential(
             [
                 keras.layers.Dense(
@@ -105,10 +73,13 @@ def test_feature_distillation_end_to_end(self):
                 keras.layers.Dense(
                     32, activation="relu", name="teacher_dense_2"
                 ),
-                keras.layers.Dense(10, name="teacher_output"),
+                keras.layers.Dense(
+                    10, activation="softmax", name="teacher_output"
+                ),
             ]
         )
 
+        # Create student model (smaller)
         student = keras.Sequential(
             [
                 keras.layers.Dense(
@@ -117,155 +88,199 @@ def test_feature_distillation_end_to_end(self):
                 keras.layers.Dense(
                     16, activation="relu", name="student_dense_2"
                 ),
-                keras.layers.Dense(10, name="student_output"),
+                keras.layers.Dense(
+                    10, activation="softmax", name="student_output"
+                ),
             ]
         )
 
-        # Build models
-        dummy_input = np.random.random((1, 10)).astype(np.float32)
-        _ = teacher(dummy_input)
-        _ = student(dummy_input)
-
-        # Test MSE loss
-        strategy_mse = FeatureDistillation(
-            loss="mse",
-            teacher_layer_name="teacher_dense_1",
-            student_layer_name="student_dense_1",
-        )
-
-        teacher_features = keras.ops.convert_to_tensor(
-            np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
-        )
-        student_features = keras.ops.convert_to_tensor(
-            np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"
+        # Create distiller
+        distiller = Distiller(
+            teacher=teacher,
+            student=student,
+            strategy=LogitsDistillation(temperature=3.0),
+            student_loss_weight=0.5,
+            optimizer=keras.optimizers.Adam(learning_rate=0.01),
+            student_loss="sparse_categorical_crossentropy",
+            metrics=["accuracy"],
         )
 
-        loss_mse = strategy_mse.compute_loss(teacher_features, student_features)
-        self.assertEqual(len(loss_mse.shape), 0)
-        self.assertTrue(keras.ops.isfinite(loss_mse))
-        self.assertGreater(loss_mse, 0.0)
+        # Create test data
+        x = np.random.random((32, 20)).astype(np.float32)
+        y = np.random.randint(0, 10, (32,)).astype(np.int32)
 
-        # Test cosine loss
-        strategy_cosine = FeatureDistillation(loss="cosine_similarity")
-        loss_cosine = strategy_cosine.compute_loss(
-            teacher_features, student_features
-        )
-        self.assertEqual(len(loss_cosine.shape), 0)
-        self.assertTrue(keras.ops.isfinite(loss_cosine))
-        self.assertGreaterEqual(loss_cosine, 0.0)
+        # Test training
+        history = distiller.fit(x, y, epochs=2, verbose=0)
 
+        # Verify training completed
+        self.assertIn("total_loss", history.history)
+        self.assertIn("student_loss", history.history)
+        self.assertIn("distillation_loss", history.history)
 
-@pytest.mark.requires_trainable_backend
-class TestMultiStrategyDistillation(TestCase):
-    """Essential test cases for multi-strategy distillation."""
+        # Verify loss values are reasonable
+        final_loss = history.history["total_loss"][-1]
+        self.assertTrue(np.isfinite(final_loss))
+        self.assertGreater(final_loss, 0.0)
 
-    def test_multi_strategy_distillation_end_to_end(self):
-        """Test multi-strategy distillation end-to-end."""
+        # Test prediction
+        predictions = distiller.predict(x[:5], verbose=0)
+        self.assertEqual(predictions.shape, (5, 10))
 
-        # Create dummy multi-output data with very different values
-        teacher_outputs = [
-            keras.ops.convert_to_tensor(
-                np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
-            ),
-            keras.ops.convert_to_tensor(
-                np.array([[0.1, 0.2], [0.3, 0.4]]), dtype="float32"
-            ),
-        ]
-        student_outputs = [
-            keras.ops.convert_to_tensor(
-                np.array([[10.0, 20.0, 30.0], [40.0, 50.0, 60.0]]),
-                dtype="float32",
-            ),
-            keras.ops.convert_to_tensor(
-                np.array([[0.5, 0.6], [0.7, 0.8]]), dtype="float32"
-            ),
-        ]
+        # Test student model access
+        student_model = distiller.student_model
+        self.assertIsInstance(student_model, keras.Model)
 
-        # Test individual strategies with matching loss structures
-        logits_strategy_list = LogitsDistillation(
-            temperature=2.0, loss=["kl_divergence", "kl_divergence"]
+    def test_feature_distillation_end_to_end(self):
+        """Test end-to-end feature distillation with real models."""
+        # Create teacher model
+        teacher = keras.Sequential(
+            [
+                keras.layers.Dense(
+                    32, activation="relu", name="teacher_dense_1"
+                ),
+                keras.layers.Dense(
+                    16, activation="relu", name="teacher_dense_2"
+                ),
+                keras.layers.Dense(10, name="teacher_output"),
+            ]
         )
-        feature_strategy_list = FeatureDistillation(loss=["mse", "mse"])
 
-        logits_loss = logits_strategy_list.compute_loss(
-            teacher_outputs, student_outputs
+        # Create student model with compatible intermediate layer sizes
+        student = keras.Sequential(
+            [
+                keras.layers.Dense(
+                    32, activation="relu", name="student_dense_1"
+                ),
+                keras.layers.Dense(
+                    16, activation="relu", name="student_dense_2"
+                ),
+                keras.layers.Dense(10, name="student_output"),
+            ]
         )
-        feature_loss = feature_strategy_list.compute_loss(
-            teacher_outputs, student_outputs
+
+        # Build models first
+        dummy_input = np.random.random((2, 20)).astype(np.float32)
+        teacher(dummy_input)
+        student(dummy_input)
+
+        # Create distiller with feature distillation
+        distiller = Distiller(
+            teacher=teacher,
+            student=student,
+            strategy=FeatureDistillation(
+                loss="mse",
+                teacher_layer_name="teacher_dense_1",
+                student_layer_name="student_dense_1",
+            ),
+            student_loss_weight=0.5,
+            optimizer=keras.optimizers.Adam(learning_rate=0.01),
+            student_loss="sparse_categorical_crossentropy",
         )
 
-        # Check that losses are scalar tensors
-        self.assertEqual(len(logits_loss.shape), 0)
-        self.assertEqual(len(feature_loss.shape), 0)
+        # Create test data
+        x = np.random.random((32, 20)).astype(np.float32)
+        y = np.random.randint(0, 10, (32,)).astype(np.int32)
 
-        # Check that losses are finite and positive
-        self.assertTrue(keras.ops.isfinite(logits_loss))
-        self.assertTrue(keras.ops.isfinite(feature_loss))
-        self.assertGreater(logits_loss, 0.0)
-        self.assertGreater(feature_loss, 0.0)
+        # Test training
+        history = distiller.fit(x, y, epochs=2, verbose=0)
 
-    def test_end_to_end_with_multi_output_models(self):
-        """Test end-to-end training with multi-output models."""
+        # Verify training completed
+        self.assertIn("total_loss", history.history)
+        self.assertIn("student_loss", history.history)
+        self.assertIn("distillation_loss", history.history)
 
-        # Create multi-output models
-        teacher = MultiOutputTeacher(vocab_size=10, hidden_dim=32)
-        student = MultiOutputStudent(vocab_size=10, hidden_dim=16)
+        # Verify feature extraction worked
+        self.assertIsNotNone(distiller._teacher_feature_extractor)
+        self.assertIsNotNone(distiller._student_feature_extractor)
 
-        # Build models before creating the distiller
-        teacher.build((None, 5))
-        student.build((None, 5))
+        # Test that feature extractors have correct outputs
+        self.assertEqual(
+            len(distiller._teacher_feature_extractor.outputs), 2
+        )  # final + dense_1
+        self.assertEqual(
+            len(distiller._student_feature_extractor.outputs), 2
+        )  # final + dense_1
 
-        # Create strategies list
+    def test_multi_strategy_distillation_end_to_end(self):
+        """Test end-to-end distillation with multiple strategies."""
+        # Create models
+        teacher = keras.Sequential(
+            [
+                keras.layers.Dense(
+                    32, activation="relu", name="teacher_dense_1"
+                ),
+                keras.layers.Dense(
+                    16, activation="relu", name="teacher_dense_2"
+                ),
+                keras.layers.Dense(10, name="teacher_output"),
+            ]
+        )
+
+        student = keras.Sequential(
+            [
+                keras.layers.Dense(
+                    32, activation="relu", name="student_dense_1"
+                ),
+                keras.layers.Dense(
+                    16, activation="relu", name="student_dense_2"
+                ),
+                keras.layers.Dense(10, name="student_output"),
+            ]
+        )
+
+        # Build models first
+        dummy_input = np.random.random((2, 20)).astype(np.float32)
+        teacher(dummy_input)
+        student(dummy_input)
+
+        # Create multiple strategies
         strategies = [
-            LogitsDistillation(
-                temperature=2.0, loss=["kl_divergence", "kl_divergence"]
+            LogitsDistillation(temperature=3.0),
+            FeatureDistillation(
+                loss="mse",
+                teacher_layer_name="teacher_dense_1",
+                student_layer_name="student_dense_1",
             ),
             FeatureDistillation(
-                loss=["mse", "mse"]
-            ),  # Match multi-output structure
+                loss="mse",
+                teacher_layer_name="teacher_dense_2",
+                student_layer_name="student_dense_2",
+            ),
         ]
-        strategy_weights = [1.0, 0.5]
 
-        # Create distiller with strategies list
+        # Create distiller
         distiller = Distiller(
             teacher=teacher,
             student=student,
             strategies=strategies,
-            strategy_weights=strategy_weights,
+            strategy_weights=[1.0, 0.5, 0.3],
             student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            student_loss=[
-                "sparse_categorical_crossentropy",
-                "sparse_categorical_crossentropy",
-            ],
-            metrics=[
-                ["accuracy"],  # Metrics for output 0
-                ["accuracy"],  # Metrics for output 1
-            ],
+            student_loss="sparse_categorical_crossentropy",
         )
 
-        # Create test data for multi-output model
-        x = np.random.random((20, 5)).astype(np.float32)
-        # Multi-output targets: [output1_targets, output2_targets]
-        y = [
-            np.random.randint(0, 10, (20,)).astype(
-                np.int32
-            ),  # For output1 (10 classes)
-            np.random.randint(0, 5, (20,)).astype(
-                np.int32
-            ),  # For output2 (5 classes)
-        ]
+        # Create test data
+        x = np.random.random((32, 20)).astype(np.float32)
+        y = np.random.randint(0, 10, (32,)).astype(np.int32)
 
-        # Test that training works
-        history = distiller.fit(x, y, epochs=1, verbose=0)
+        # Test training
+        history = distiller.fit(x, y, epochs=2, verbose=0)
 
-        # Check that training completed
+        # Verify training completed
         self.assertIn("total_loss", history.history)
         self.assertIn("student_loss", history.history)
         self.assertIn("distillation_loss", history.history)
 
-        # Test prediction
-        predictions = distiller.predict(x[:5], verbose=0)
-        self.assertEqual(
-            predictions[0].shape, (5, 10)
-        )  # Should return first output
+        # Verify efficient feature extraction
+        self.assertIsNotNone(distiller._teacher_feature_extractor)
+        self.assertIsNotNone(distiller._student_feature_extractor)
+
+        # Should have 3 outputs: final + dense_1 + dense_2
+        self.assertEqual(len(distiller._teacher_feature_extractor.outputs), 3)
+        self.assertEqual(len(distiller._student_feature_extractor.outputs), 3)
+
+        # Test that loss decreases (learning is happening)
+        initial_loss = history.history["total_loss"][0]
+        final_loss = history.history["total_loss"][-1]
+        self.assertTrue(np.isfinite(initial_loss))
+        self.assertTrue(np.isfinite(final_loss))
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index da72a8ddb604..7683748ab8f7 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -6,8 +6,8 @@
 import pytest
 
 import keras
+from keras.src.distillation.distillation_loss import LogitsDistillation
 from keras.src.distillation.distiller import Distiller
-from keras.src.distillation.strategies import LogitsDistillation
 from keras.src.testing import TestCase
 
 

From 1607807170eea92e2b3fae201117580dea91b6a5 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Tue, 2 Sep 2025 18:16:45 -0700
Subject: [PATCH 23/31] update docstrings

---
 keras/src/distillation/distillation_loss.py | 168 ++---------------
 keras/src/distillation/distiller.py         | 191 +++-----------------
 2 files changed, 41 insertions(+), 318 deletions(-)

diff --git a/keras/src/distillation/distillation_loss.py b/keras/src/distillation/distillation_loss.py
index 58d55c1f8034..02eaa6544cf9 100644
--- a/keras/src/distillation/distillation_loss.py
+++ b/keras/src/distillation/distillation_loss.py
@@ -70,54 +70,12 @@ class FeatureDistillation(DistillationLoss):
     helps the student learn better internal representations and often leads
     to better performance compared to logits-only distillation.
 
-    How Feature Distillation Works:
-
-    1. Layer Selection: Specify which intermediate layers from teacher and
-       student models to use for distillation. These layers should have
-       compatible architectures or similar semantic meaning.
-
-    2. Feature Extraction: Extract activations from the specified layers
-       during forward pass. The teacher features are computed with
-       `training=False` (frozen), while student features are computed with
-       `training=True`.
-
-    3. Loss Computation: Compute loss between teacher and student features
-       using either MSE (for identical shapes) or cosine similarity (for
-       different shapes).
-
-    When to Use Feature Distillation:
-
-    - Similar Architectures: When teacher and student have similar layer
-      structures (e.g., both are CNNs with similar depths)
-    - Performance Improvement: Often leads to better student performance
-      than logits-only distillation
-    - Representation Learning: Helps student learn better internal features
-    - Multi-Scale Distillation: Can distill features from multiple layers
-      simultaneously
-
-    Layer Selection Guidelines:
-
-    - Early Layers: Capture low-level features (edges, textures)
-    - Middle Layers: Capture mid-level features (shapes, patterns)
-    - Late Layers: Capture high-level features (semantic concepts)
-    - Compatible Sizes: Choose layers with similar output dimensions
-    - Semantic Alignment: Match layers that serve similar functions
-
-    Loss Type Selection:
-
-    - `"mse"`: Use when teacher and student features have identical shapes.
-      Provides direct feature matching.
-    - `"cosine"`: Use when features have different shapes but
-      same feature dimension (last axis). Focuses on feature direction
-      rather than magnitude.
-
     Args:
-        loss: Loss function(s) to use for feature distillation. Can be:
+        loss: Loss function to use for feature distillation. Can be:
             - String identifier (e.g., 'mse', 'cosine_similarity', 'mae')
             - Keras loss instance
             - List/tuple of losses for multi-output models
             - Dict of losses for named outputs
-            The structure must match the model's output structure.
             Defaults to 'mse'.
         teacher_layer_name: Name of the teacher layer to extract features from.
             If None, uses the final output. Defaults to None.
@@ -130,42 +88,7 @@ class FeatureDistillation(DistillationLoss):
     # Basic feature distillation from final outputs
     strategy = FeatureDistillation(loss="mse")
 
-    # With custom loss instance
-    strategy = FeatureDistillation(
-        loss=keras.losses.MeanAbsoluteError()
-    )
-
-    # For multi-output models with list structure
-    strategy = FeatureDistillation(
-        loss=["mse", "cosine_similarity"]
-    )
-
-    # For multi-output models with dict structure
-    strategy = FeatureDistillation(
-        loss={
-            "features_1": "mse",
-            "features_2": "cosine_similarity"
-        }
-    )
-
-    # Custom loss by subclassing
-    class CustomFeatureDistillation(FeatureDistillation):
-        def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            # Apply loss using tree.map_structure
-            return tree.map_structure(
-                lambda t, s: keras.ops.mean(
-                    keras.ops.abs(t - s)
-                ),
-                teacher_outputs,
-                student_outputs
-            )
-
-    strategy = CustomFeatureDistillation(
-        teacher_layer_name="dense_1",
-        student_layer_name="dense_1"
-    )
-
-    # Distill from specific layers with compatible shapes
+    # Distill from specific intermediate layers
     strategy = FeatureDistillation(
         loss="mse",
         teacher_layer_name="dense_1",
@@ -179,11 +102,14 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         student_layer_name="conv2d_1"
     )
 
-    # Distill from final outputs (equivalent to logits distillation)
+    # With custom loss instance
     strategy = FeatureDistillation(
-        loss="mse",
-        teacher_layer_name=None,  # Final output
-        student_layer_name=None   # Final output
+        loss=keras.losses.MeanAbsoluteError()
+    )
+
+    # For multi-output models
+    strategy = FeatureDistillation(
+        loss=["mse", "cosine_similarity"]
     )
     ```
     """
@@ -364,18 +290,9 @@ def _validate_layer_compatibility(self, teacher_outputs, student_outputs):
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute feature distillation loss using extracted features.
 
-        Note: This method expects the outputs to already be the extracted
-        features from the specified layers, not the final model outputs.
-        The Distiller class is responsible for extracting the features
-        using the methods provided by this strategy.
-
         Args:
-            teacher_outputs: Intermediate features from teacher model.
-                Can be a single tensor, list/tuple of tensors, or dict of
-                tensors.
-            student_outputs: Intermediate features from student model.
-                Can be a single tensor, list/tuple of tensors, or dict of
-                tensors.
+            teacher_outputs: Features from teacher model.
+            student_outputs: Features from student model.
             **kwargs: Additional arguments (ignored).
         Returns:
             Feature distillation loss tensor.
@@ -435,44 +352,19 @@ class LogitsDistillation(FeatureDistillation):
     computing the loss between teacher and student predictions. It's the most
     common approach for knowledge distillation.
 
-    How Logits Distillation Works:
-
-    1. Temperature Scaling: The teacher's logits are divided by a `temperature`
-       parameter (typically 3-5) before applying softmax. This creates "softer"
-       probability distributions that reveal relationships between classes.
-
-    2. Loss Computation: The loss is computed between the temperature-scaled
-       teacher logits and student logits using the specified loss function.
-
-    When to Use Logits Distillation:
-
-    - General Classification: Works well for most classification tasks
-    - Model Compression: Effective for reducing model size while maintaining
-      accuracy
-    - Transfer Learning: Good for leveraging knowledge from pre-trained models
-    - Ensemble Distillation: Can combine multiple teacher models
-
-    Temperature Guidelines:
-
-    - Low Temperature (1-2): Sharp distributions, similar to hard labels
-    - Medium Temperature (3-5): Balanced softness, most commonly used
-    - High Temperature (6-10): Very soft distributions, reveals subtle
-      relationships
-
     Args:
         temperature: Temperature for softmax scaling. Higher values produce
             softer probability distributions that are easier for the student to
             learn. Typical values range from 3-5. Defaults to 3.0.
-        loss: Loss function(s) to use for distillation. Can be:
+        loss: Loss function to use for distillation. Can be:
             - String identifier (e.g., 'kl_divergence',
               'categorical_crossentropy')
             - Keras loss instance
             - List/tuple of losses for multi-output models
             - Dict of losses for named outputs
-            The structure must match the model's output structure.
             Defaults to 'kl_divergence'.
 
-    Example:
+    Examples:
 
     ```python
     # Basic logits distillation with KL divergence
@@ -490,43 +382,11 @@ class LogitsDistillation(FeatureDistillation):
         loss=keras.losses.CategoricalCrossentropy(from_logits=True)
     )
 
-    # For multi-output models with list structure
+    # For multi-output models
     strategy = LogitsDistillation(
         temperature=3.0,
         loss=["kl_divergence", "categorical_crossentropy"]
     )
-
-    # For multi-output models with dict structure
-    strategy = LogitsDistillation(
-        temperature=3.0,
-        loss={
-            "classification": "kl_divergence",
-            "regression": "mse"
-        }
-    )
-
-    # Custom loss by subclassing
-    class CustomLogitsDistillation(LogitsDistillation):
-        def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
-            # Apply temperature scaling using tree.map_structure
-            teacher_scaled = tree.map_structure(
-                lambda x: x / self.temperature, teacher_outputs
-            )
-            student_scaled = tree.map_structure(
-                lambda x: x / self.temperature, student_outputs
-            )
-
-            # Custom loss computation
-            return tree.map_structure(
-                lambda t, s: keras.ops.mean(
-                    keras.losses.kl_divergence(
-                        keras.ops.softmax(t, axis=-1),
-                        keras.ops.softmax(s, axis=-1)
-                    )
-                ),
-                teacher_scaled,
-                student_scaled
-            )
     ```
     """
 
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index dfe28a28db55..6bf7e9a7930a 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -10,74 +10,23 @@ class Distiller(Model):
     """Distillation model for transferring knowledge from teacher to student.
 
     Knowledge distillation transfers knowledge from a large, complex model
-    (`teacher`) to a smaller, simpler model (`student`). The student learns
+    (teacher) to a smaller, simpler model (student). The student learns
     from both ground truth labels and the teacher's predictions, often
     achieving better performance than training on labels alone.
 
-    How Knowledge Distillation Works:
-
-    1. Teacher Model: A pre-trained, larger model that has learned complex
-       patterns and relationships in the data. The teacher is frozen during
-       distillation.
-
-    2. Student Model: A smaller, simpler model that we want to train to mimic
-       the teacher's behavior while being more efficient for deployment.
-
-    3. Distillation Process: The student learns from two sources:
-       - Hard targets: Traditional supervised learning with ground truth labels
-       - Soft targets: The teacher's predictions, which contain information
-         about class relationships and confidence levels
-
-    4. Temperature Scaling: The teacher's logits are divided by a `temperature`
-       parameter before applying softmax, creating "softer" probability
-       distributions that are easier for the student to learn from.
-
-    When to Use Knowledge Distillation:
-
-    - Model Compression: Reduce model size for deployment on
-      resource-constrained devices
-    - Performance Improvement: Student models often outperform models trained
-      only on labels
-    - Transfer Learning: Leverage knowledge from large pre-trained models
-    - Ensemble Distillation: Combine multiple teacher models into a single
-      student
-
-    Strategy Selection Guide:
-
-    - `LogitsDistillation`: Most common approach. Transfers final output
-      knowledge. Use for classification tasks where you want the student to
-      learn the teacher's decision boundaries and confidence patterns.
-
-    - `FeatureDistillation`: Transfers intermediate representations. Use when
-      teacher and student have similar architectures, as it helps the student
-      learn better internal representations. Often leads to better performance
-      than logits-only.
-
-    - Multiple Strategies: For models with multiple outputs (e.g., object
-      detection with classification and regression heads), pass a list of
-      strategies with corresponding weights. Each strategy will be applied to
-      its corresponding output.
-
-    - Custom Strategies: Create custom strategies by subclassing
-      `DistillationLoss` and overriding the `compute_loss` method.
-
     Args:
         teacher: A trained `keras.Model` that serves as the knowledge source.
             The teacher model is frozen during distillation.
-        student: A `keras.Model` to be trained through distillation. This model
-            will learn from both ground truth labels and the teacher's
-            predictions.
+        student: A `keras.Model` to be trained through distillation.
         strategy: Single distillation strategy to apply. Can be
             `LogitsDistillation`, `FeatureDistillation`, or a custom strategy.
             Use `strategies` for multiple strategies.
-        strategies: List of distillation strategies to apply. Each strategy will
-            be applied to its corresponding output. Use `strategy` for a single
-            strategy.
+        strategies: List of distillation strategies to apply. Use `strategy`
+            for a single strategy.
         strategy_weights: List of weights for each strategy. Must have the same
             length as `strategies`. If None, equal weights are used.
         student_loss_weight: Weight for the student's supervised loss component.
-            Must be between 0 and 1. Higher values emphasize ground truth
-            labels, lower values emphasize teacher predictions. Defaults to 0.5.
+            Must be between 0 and 1. Defaults to 0.5.
         optimizer: Optimizer for training the student model. Can be a string
             identifier (e.g., `'adam'`) or an optimizer instance.
         student_loss: Loss function for the student's supervised learning
@@ -87,56 +36,49 @@ class Distiller(Model):
         **kwargs: Additional keyword arguments passed to the parent `Model`
             class.
 
-    Example:
+        Examples:
 
     ```python
-    # Load pre-trained teacher model from KerasHub
+    # Basic distillation with KerasHub models
     import keras_hub as hub
 
-    teacher = hub.models.CausalLM.from_preset("gemma3_4b_en")
+    teacher = hub.models.CausalLM.from_preset("gemma_2b_en")
     student = hub.models.CausalLM.from_preset(
-        "gemma2_2b_en", load_weights=False
+        "gemma_1.1_2b_en", load_weights=False
     )
 
-    # Create distillation strategy
     strategy = LogitsDistillation(temperature=3.0)
 
-    # Create distiller
     distiller = Distiller(
         teacher=teacher,
         student=student,
         strategy=strategy,
-        student_loss_weight=0.7,
         optimizer='adam',
-        student_loss='sparse_categorical_crossentropy',
-        metrics=['accuracy']
+        student_loss='sparse_categorical_crossentropy'
     )
 
     # Train the distiller
-    distiller.fit(x_train, y_train, epochs=10, validation_split=0.2)
+    distiller.fit(x_train, y_train, epochs=10)
 
     # Access the trained student model
     trained_student = distiller.student_model
-    ```
 
-    For multi-output models:
-
-    ```python
-    # Create multiple strategies for different outputs
+    # Multiple strategies
     strategies = [
-        LogitsDistillation(temperature=3.0, output_index=0),
-        LogitsDistillation(temperature=2.0, output_index=1)
+        LogitsDistillation(temperature=3.0),
+        FeatureDistillation(
+            teacher_layer_name="dense_1",
+            student_layer_name="dense_1"
+        )
     ]
-    strategy_weights = [1.0, 0.5]  # Weight classification more heavily
 
     distiller = Distiller(
         teacher=teacher,
         student=student,
         strategies=strategies,
-        strategy_weights=strategy_weights,
-        student_loss_weight=0.5,
+        strategy_weights=[1.0, 0.5],
         optimizer='adam',
-        student_loss=['sparse_categorical_crossentropy', 'mse']
+        student_loss='sparse_categorical_crossentropy'
     )
     ```
     """
@@ -269,16 +211,7 @@ def convert_loss_to_function(loss):
         self.compile(optimizer=optimizer, loss=student_loss, metrics=metrics)
 
     def _validate_models(self, teacher, student):
-        """Validate that teacher and student models are compatible for
-        distillation.
-
-        This method performs comprehensive validation including:
-        - Model type validation
-        - Input shape compatibility
-        - Output shape compatibility
-        - Architecture compatibility for feature distillation
-        - Data type compatibility
-        """
+        """Validate that teacher and student models are compatible."""
         # Basic model type validation
         if not isinstance(teacher, keras.Model):
             raise ValueError(
@@ -456,13 +389,7 @@ def _shapes_are_compatible(self, shape1, shape2):
         return True
 
     def _create_multi_feature_extractors(self):
-        """Create efficient feature extractors that extract all needed features
-        in single forward passes.
-
-        This method analyzes all FeatureDistillation strategies to determine
-        which layers need feature extraction, then creates models that extract
-        all required features in one pass to avoid redundant computation.
-        """
+        """Create feature extractors for efficient multi-layer extraction."""
         # Collect all layer names needed for feature extraction
         teacher_layer_names = []
         student_layer_names = []
@@ -566,13 +493,13 @@ def _create_multi_feature_extractors(self):
                 self._student_feature_extractor = None
 
     def _extract_all_teacher_features(self, x):
-        """Extract all teacher features efficiently in a single forward pass.
+        """Extract all teacher features in a single forward pass.
 
         Args:
             x: Input data.
 
         Returns:
-            Dict mapping layer names to their outputs, including 'final_output'.
+            Dict mapping layer names to their outputs.
         """
         if self._teacher_feature_extractor is not None:
             # Use efficient multi-output extractor
@@ -592,15 +519,14 @@ def _extract_all_teacher_features(self, x):
             return {"final_output": self.teacher(x, training=False)}
 
     def _extract_all_student_features(self, x, y_pred):
-        """Extract all student features efficiently in a single forward pass.
+        """Extract all student features in a single forward pass.
 
         Args:
             x: Input data.
-            y_pred: Student predictions from forward pass (to avoid
-                recomputation).
+            y_pred: Student predictions from forward pass.
 
         Returns:
-            Dict mapping layer names to their outputs, including 'final_output'.
+            Dict mapping layer names to their outputs.
         """
         if self._student_feature_extractor is not None:
             # Use efficient multi-output extractor
@@ -620,8 +546,7 @@ def _extract_all_student_features(self, x, y_pred):
             return {"final_output": y_pred}
 
     def _get_strategy_features(self, strategy, all_features, is_teacher):
-        """Get the specific features needed by a strategy from pre-extracted
-        features.
+        """Get the specific features needed by a strategy.
 
         Args:
             strategy: The FeatureDistillation strategy.
@@ -648,33 +573,8 @@ def _get_strategy_features(self, strategy, all_features, is_teacher):
     def student_model(self):
         """The trained student model for independent use.
 
-        This property provides access to the student model that has been trained
-        through the distillation process. The student model can be used
-        independently for inference, further training, or saving.
-
         Returns:
             keras.Model: The trained student model.
-
-        Example:
-            ```python
-            # After training the distiller
-            distiller.fit(x_train, y_train, epochs=10)
-
-            # Access the trained student model
-            trained_student = distiller.student_model
-
-            # Use the student model independently
-            predictions = trained_student.predict(x_test)
-
-            # Save the student model
-            trained_student.save('my_student_model.keras')
-
-            # Further train the student model
-            trained_student.compile(
-                optimizer='adam', loss='sparse_categorical_crossentropy'
-            )
-            trained_student.fit(x_new, y_new, epochs=5)
-            ```
         """
         return self.student
 
@@ -687,10 +587,6 @@ def compute_loss(
     ):
         """Compute combined distillation loss.
 
-        This method integrates distillation into Keras's standard training
-        workflow. Users can override this method to implement custom
-        distillation loss computation.
-
         Args:
             x: Input data.
             y: Target data.
@@ -700,39 +596,6 @@ def compute_loss(
 
         Returns:
             Combined loss tensor.
-
-        Example:
-            ```python
-            # Custom distillation loss by overriding compute_loss
-            class CustomDistiller(Distiller):
-                def compute_loss(self, x=None, y=None, y_pred=None,
-                               sample_weight=None, training=None):
-                    # Custom student loss computation
-                    student_loss = keras.losses.sparse_categorical_crossentropy(
-                        y, y_pred
-                    )
-
-                    # Custom distillation loss computation
-                    teacher_outputs = self.teacher(x, training=False)
-                    student_outputs = y_pred
-
-                    # Custom loss logic here
-                    distillation_loss = self._custom_distillation_loss(
-                        teacher_outputs, student_outputs
-                    )
-
-                    # Combine losses with custom weighting
-                    total_loss = 0.7 * student_loss + 0.3 * distillation_loss
-
-                    return total_loss
-
-                def _custom_distillation_loss(self, teacher_outputs,
-                                            student_outputs):
-                    # Implement custom distillation loss logic
-                    return keras.ops.mean(
-                        keras.ops.square(teacher_outputs - student_outputs)
-                    )
-            ```
         """
         # Compute student loss using tree operations for dicts, manual for lists
         student_loss = 0.0

From 565931057f9b866f87ee15120a68f8a41bb695cd Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Tue, 2 Sep 2025 18:35:36 -0700
Subject: [PATCH 24/31] add validation for feature extraction setup

---
 keras/src/distillation/distillation_loss.py |  6 +-
 keras/src/distillation/distiller.py         | 65 ++++++++++++++++++---
 2 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/keras/src/distillation/distillation_loss.py b/keras/src/distillation/distillation_loss.py
index 02eaa6544cf9..8930b68117dc 100644
--- a/keras/src/distillation/distillation_loss.py
+++ b/keras/src/distillation/distillation_loss.py
@@ -291,8 +291,10 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute feature distillation loss using extracted features.
 
         Args:
-            teacher_outputs: Features from teacher model.
-            student_outputs: Features from student model.
+            teacher_outputs: Extracted features from the specified teacher
+                layer.
+            student_outputs: Extracted features from the specified student
+                layer.
             **kwargs: Additional arguments (ignored).
         Returns:
             Feature distillation loss tensor.
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 6bf7e9a7930a..3e568bf36947 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -197,6 +197,10 @@ def convert_loss_to_function(loss):
         # Create efficient multi-layer feature extractors
         self._create_multi_feature_extractors()
 
+        # Validate that feature extraction setup succeeded for
+        # FeatureDistillation strategies
+        self._validate_feature_extraction_setup()
+
         # Freeze teacher model
         self.teacher.trainable = False
 
@@ -492,6 +496,38 @@ def _create_multi_feature_extractors(self):
                 # Fallback to individual extraction for subclassed models
                 self._student_feature_extractor = None
 
+    def _validate_feature_extraction_setup(self):
+        """Validate that feature extraction setup succeeded for
+        FeatureDistillation strategies."""
+        for strategy in self.strategies:
+            # Check if strategy has layer names (indicates FeatureDistillation)
+            if (
+                hasattr(strategy, "teacher_layer_name")
+                and strategy.teacher_layer_name is not None
+            ):
+                if self._teacher_feature_extractor is None:
+                    raise RuntimeError(
+                        f"FeatureDistillation strategy targeting teacher layer "
+                        f"'{strategy.teacher_layer_name}' failed to create "
+                        f"feature extractor. This can happen with subclassed "
+                        f"models or models that haven't been built. Consider "
+                        f"using LogitsDistillation instead, or ensure your "
+                        f"models are built by calling them with sample input."
+                    )
+            if (
+                hasattr(strategy, "student_layer_name")
+                and strategy.student_layer_name is not None
+            ):
+                if self._student_feature_extractor is None:
+                    raise RuntimeError(
+                        f"FeatureDistillation strategy targeting student layer "
+                        f"'{strategy.student_layer_name}' failed to create "
+                        f"feature extractor. This can happen with subclassed "
+                        f"models or models that haven't been built. Consider "
+                        f"using LogitsDistillation instead, or ensure your "
+                        f"models are built by calling them with sample input."
+                    )
+
     def _extract_all_teacher_features(self, x):
         """Extract all teacher features in a single forward pass.
 
@@ -583,7 +619,7 @@ def call(self, inputs, training=None, **kwargs):
         return self.student(inputs, training=training, **kwargs)
 
     def compute_loss(
-        self, x=None, y=None, y_pred=None, sample_weight=None, training=None
+        self, x=None, y=None, y_pred=None, sample_weight=None, training=True
     ):
         """Compute combined distillation loss.
 
@@ -657,12 +693,27 @@ def compute_loss(
                 # Get appropriate outputs/features for this strategy
                 if hasattr(strategy, "teacher_layer_name"):
                     # FeatureDistillation - use extracted features
-                    strategy_teacher_output = self._get_strategy_features(
-                        strategy, teacher_features, is_teacher=True
-                    )
-                    strategy_student_output = self._get_strategy_features(
-                        strategy, student_features, is_teacher=False
-                    )
+                    try:
+                        strategy_teacher_output = self._get_strategy_features(
+                            strategy, teacher_features, is_teacher=True
+                        )
+                        strategy_student_output = self._get_strategy_features(
+                            strategy, student_features, is_teacher=False
+                        )
+                    except ValueError as e:
+                        # Provide more helpful error message for feature
+                        # extraction failures
+                        raise RuntimeError(
+                            f"FeatureDistillation failed for strategy "
+                            f"targeting teacher layer "
+                            f"'{strategy.teacher_layer_name}' and student "
+                            f"layer '{strategy.student_layer_name}'. This can "
+                            f"happen "
+                            f"with subclassed models or models that haven't "
+                            f"been built properly. Consider using only "
+                            f"LogitsDistillation for such models. "
+                            f"Original error: {e}"
+                        ) from e
                 else:
                     # LogitsDistillation - use final model outputs
                     strategy_teacher_output = teacher_features["final_output"]

From d3b27b3f13d5a4e119dcfad0bc4ad3b9b4f6d3fc Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Thu, 4 Sep 2025 14:01:55 -0700
Subject: [PATCH 25/31] Fix distiller API to accept single strategy or list of
 strategies

---
 .../distillation/distillation_loss_test.py    |  4 +-
 keras/src/distillation/distiller.py           | 75 +++++++------------
 keras/src/distillation/distiller_test.py      | 20 ++---
 3 files changed, 38 insertions(+), 61 deletions(-)

diff --git a/keras/src/distillation/distillation_loss_test.py b/keras/src/distillation/distillation_loss_test.py
index 969034e53741..b2b3c3322c1a 100644
--- a/keras/src/distillation/distillation_loss_test.py
+++ b/keras/src/distillation/distillation_loss_test.py
@@ -98,7 +98,7 @@ def test_logits_distillation_end_to_end(self):
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategy=LogitsDistillation(temperature=3.0),
+            strategies=LogitsDistillation(temperature=3.0),
             student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
             student_loss="sparse_categorical_crossentropy",
@@ -167,7 +167,7 @@ def test_feature_distillation_end_to_end(self):
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategy=FeatureDistillation(
+            strategies=FeatureDistillation(
                 loss="mse",
                 teacher_layer_name="teacher_dense_1",
                 student_layer_name="student_dense_1",
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 3e568bf36947..e0d2a053340f 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -18,13 +18,11 @@ class Distiller(Model):
         teacher: A trained `keras.Model` that serves as the knowledge source.
             The teacher model is frozen during distillation.
         student: A `keras.Model` to be trained through distillation.
-        strategy: Single distillation strategy to apply. Can be
-            `LogitsDistillation`, `FeatureDistillation`, or a custom strategy.
-            Use `strategies` for multiple strategies.
-        strategies: List of distillation strategies to apply. Use `strategy`
-            for a single strategy.
-        strategy_weights: List of weights for each strategy. Must have the same
-            length as `strategies`. If None, equal weights are used.
+        strategies: List of distillation strategies to apply. Can be a single
+            strategy or a list of strategies like `LogitsDistillation`,
+            `FeatureDistillation`, or custom distillation strategies.
+        strategy_weights: List of weights for each distillation strategy. Must
+            have the same length as `strategies`. If None, equal weights used.
         student_loss_weight: Weight for the student's supervised loss component.
             Must be between 0 and 1. Defaults to 0.5.
         optimizer: Optimizer for training the student model. Can be a string
@@ -36,7 +34,7 @@ class Distiller(Model):
         **kwargs: Additional keyword arguments passed to the parent `Model`
             class.
 
-        Examples:
+    Examples:
 
     ```python
     # Basic distillation with KerasHub models
@@ -47,12 +45,11 @@ class Distiller(Model):
         "gemma_1.1_2b_en", load_weights=False
     )
 
-    strategy = LogitsDistillation(temperature=3.0)
-
+    # Single distillation strategy
     distiller = Distiller(
         teacher=teacher,
         student=student,
-        strategy=strategy,
+        strategies=LogitsDistillation(temperature=3.0),
         optimizer='adam',
         student_loss='sparse_categorical_crossentropy'
     )
@@ -63,19 +60,17 @@ class Distiller(Model):
     # Access the trained student model
     trained_student = distiller.student_model
 
-    # Multiple strategies
-    strategies = [
-        LogitsDistillation(temperature=3.0),
-        FeatureDistillation(
-            teacher_layer_name="dense_1",
-            student_layer_name="dense_1"
-        )
-    ]
-
+    # Multiple distillation strategies
     distiller = Distiller(
         teacher=teacher,
         student=student,
-        strategies=strategies,
+        strategies=[
+            LogitsDistillation(temperature=3.0),
+            FeatureDistillation(
+                teacher_layer_name="dense_1",
+                student_layer_name="dense_1"
+            )
+        ],
         strategy_weights=[1.0, 0.5],
         optimizer='adam',
         student_loss='sparse_categorical_crossentropy'
@@ -87,8 +82,7 @@ def __init__(
         self,
         teacher,
         student,
-        strategy=None,
-        strategies=None,
+        strategies,
         strategy_weights=None,
         student_loss_weight=0.5,
         optimizer="adam",
@@ -148,29 +142,20 @@ def convert_loss_to_function(loss):
             convert_loss_to_function, student_loss
         )
 
-        # Handle strategy configuration
-        if strategy is not None and strategies is not None:
+        # Handle strategies configuration
+        if strategies is None:
             raise ValueError(
-                "Cannot specify both 'strategy' and 'strategies'. "
-                "Use 'strategy' for single strategy or 'strategies' for "
-                "multiple strategies."
+                "Must specify 'strategies'. "
+                "Please provide a valid distillation strategy such as "
+                "LogitsDistillation, FeatureDistillation, or a list."
             )
 
-        if strategy is not None:
-            # Single strategy mode
-            self.strategies = [strategy]
+        # Convert single strategy to list for uniform handling
+        if not isinstance(strategies, (list, tuple)):
+            self.strategies = [strategies]
             self.strategy_weights = [1.0]
-            self.single_strategy = True
-        elif strategies is not None:
-            # Multiple strategies mode
-            if not isinstance(strategies, (list, tuple)):
-                raise ValueError(
-                    f"strategies must be a list or tuple, got "
-                    f"{type(strategies)}"
-                )
-
+        else:
             self.strategies = strategies
-
             # Set default weights if not provided
             if strategy_weights is None:
                 self.strategy_weights = [1.0] * len(strategies)
@@ -182,14 +167,6 @@ def convert_loss_to_function(loss):
                     )
                 self.strategy_weights = strategy_weights
 
-            self.single_strategy = False
-        else:
-            raise ValueError(
-                "Must specify either 'strategy' or 'strategies'. "
-                "Please provide a valid strategy such as LogitsDistillation, "
-                "FeatureDistillation, or a list of strategies."
-            )
-
         # Validate strategy-specific compatibility
         for strategy in self.strategies:
             self._validate_strategy_compatibility(teacher, student, strategy)
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 7683748ab8f7..bfdafc836392 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -56,7 +56,7 @@ def setUp(self):
         self.distiller = Distiller(
             teacher=self.teacher,
             student=self.student,
-            strategy=self.strategy,
+            strategies=self.strategy,
             student_loss_weight=0.5,
             optimizer="adam",
             student_loss="sparse_categorical_crossentropy",
@@ -125,7 +125,7 @@ def test_teacher_freezing(self):
         Distiller(
             teacher=new_teacher,
             student=self.student,
-            strategy=self.strategy,
+            strategies=self.strategy,
             student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(),
             student_loss="sparse_categorical_crossentropy",
@@ -141,14 +141,14 @@ def test_model_compatibility_validation(self):
             Distiller(
                 teacher="not_a_model",
                 student=self.student,
-                strategy=self.strategy,
+                strategies=self.strategy,
             )
 
         with self.assertRaises(ValueError):
             Distiller(
                 teacher=self.teacher,
                 student="not_a_model",
-                strategy=self.strategy,
+                strategies=self.strategy,
             )
 
     def test_multi_strategy_functionality(self):
@@ -221,7 +221,7 @@ def test_student_loss_weighting(self):
         distiller_0 = Distiller(
             teacher=self.teacher,
             student=self.student,
-            strategy=self.strategy,
+            strategies=self.strategy,
             student_loss_weight=0.0,
             optimizer=keras.optimizers.Adam(),
             student_loss="sparse_categorical_crossentropy",
@@ -231,7 +231,7 @@ def test_student_loss_weighting(self):
         distiller_1 = Distiller(
             teacher=self.teacher,
             student=self.student,
-            strategy=self.strategy,
+            strategies=self.strategy,
             student_loss_weight=1.0,
             optimizer=keras.optimizers.Adam(),
             student_loss="sparse_categorical_crossentropy",
@@ -266,7 +266,7 @@ def test_full_training_workflow(self):
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategy=self.strategy,
+            strategies=self.strategy,
             student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
             student_loss="sparse_categorical_crossentropy",
@@ -332,7 +332,7 @@ def test_evaluation_workflow(self):
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategy=self.strategy,
+            strategies=self.strategy,
             student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
             student_loss="sparse_categorical_crossentropy",
@@ -366,7 +366,7 @@ def test_prediction_workflow(self):
         distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategy=self.strategy,
+            strategies=self.strategy,
             student_loss_weight=0.5,
             optimizer=keras.optimizers.Adam(learning_rate=0.01),
             student_loss="sparse_categorical_crossentropy",
@@ -419,7 +419,7 @@ def test_distiller_serialization_and_saving(self):
         original_distiller = Distiller(
             teacher=teacher,
             student=student,
-            strategy=strategy,
+            strategies=strategy,
             student_loss_weight=0.7,
             optimizer=keras.optimizers.Adam(),
             student_loss="sparse_categorical_crossentropy",

From bbe08685c574388b8161776b8140f289d16d4437 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Thu, 4 Sep 2025 14:22:16 -0700
Subject: [PATCH 26/31] minor fixes

---
 keras/src/distillation/distillation_loss.py |   9 -
 keras/src/distillation/distiller.py         | 259 ++++++++++----------
 2 files changed, 134 insertions(+), 134 deletions(-)

diff --git a/keras/src/distillation/distillation_loss.py b/keras/src/distillation/distillation_loss.py
index 8930b68117dc..d2c1bcbf4f99 100644
--- a/keras/src/distillation/distillation_loss.py
+++ b/keras/src/distillation/distillation_loss.py
@@ -222,7 +222,6 @@ def _create_feature_extractor(self, model, layer_name):
             )
 
         # Create a new model that extracts features from the specified layer.
-        # This approach is robust for models created with the Functional API.
         try:
             return keras.Model(
                 inputs=model.inputs,
@@ -231,7 +230,6 @@ def _create_feature_extractor(self, model, layer_name):
             )
         except (ValueError, AttributeError) as e:
             # Handle the case where the model doesn't have defined inputs yet
-            # (common with Sequential models that haven't been built)
             error_msg = str(e).lower()
             if (
                 "no defined inputs" in error_msg
@@ -277,7 +275,6 @@ def validate_outputs(self, teacher_outputs, student_outputs):
         ):
             # Validate that the specified layers exist and are compatible
             self._validate_layer_compatibility(teacher_outputs, student_outputs)
-        # Note: Base class already validated output count compatibility
 
     def _validate_layer_compatibility(self, teacher_outputs, student_outputs):
         """Validate that the specified layers are compatible for feature
@@ -337,12 +334,6 @@ def from_config(cls, config):
         """Create instance from configuration."""
         config = config.copy()
         config["loss"] = keras.losses.deserialize(config["loss"])
-
-        # Filter out parameters that LogitsDistillation doesn't accept
-        # (inherited from FeatureDistillation's get_config)
-        config.pop("teacher_layer_name", None)
-        config.pop("student_layer_name", None)
-
         return cls(**config)
 
 
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index e0d2a053340f..0a558973e7c2 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -167,16 +167,14 @@ def convert_loss_to_function(loss):
                     )
                 self.strategy_weights = strategy_weights
 
-        # Validate strategy-specific compatibility
+        # Validate strategy-specific compatibility and create feature extractors
         for strategy in self.strategies:
             self._validate_strategy_compatibility(teacher, student, strategy)
 
-        # Create efficient multi-layer feature extractors
         self._create_multi_feature_extractors()
 
-        # Validate that feature extraction setup succeeded for
-        # FeatureDistillation strategies
-        self._validate_feature_extraction_setup()
+        # Initialize the model - compile with provided parameters
+        self.compile(optimizer=optimizer, loss=student_loss, metrics=metrics)
 
         # Freeze teacher model
         self.teacher.trainable = False
@@ -188,9 +186,6 @@ def convert_loss_to_function(loss):
         )
         self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
 
-        # Compile the model with provided parameters
-        self.compile(optimizer=optimizer, loss=student_loss, metrics=metrics)
-
     def _validate_models(self, teacher, student):
         """Validate that teacher and student models are compatible."""
         # Basic model type validation
@@ -392,118 +387,66 @@ def _create_multi_feature_extractors(self):
         # Create multi-output feature extractors if needed
         self._teacher_feature_extractor = None
         self._student_feature_extractor = None
-        self._teacher_layer_outputs = {}
-        self._student_layer_outputs = {}
 
         if teacher_layer_names:
-            try:
-                # For Sequential models, use the last layer's output as final
-                if isinstance(self.teacher, keras.Sequential):
-                    final_output = self.teacher.layers[-1].output
-                    inputs = self.teacher.layers[0].input
-                else:
-                    # For Functional models
-                    if (
-                        not hasattr(self.teacher, "inputs")
-                        or self.teacher.inputs is None
-                    ):
-                        raise ValueError("Teacher model has no defined inputs")
-                    if (
-                        not hasattr(self.teacher, "output")
-                        or self.teacher.output is None
-                    ):
-                        raise ValueError("Teacher model has no defined output")
-                    final_output = self.teacher.output
-                    inputs = self.teacher.inputs
-
-                teacher_outputs = [final_output]  # Always include final output
-                teacher_output_names = ["final_output"]
-
-                for layer_name in teacher_layer_names:
-                    layer = self.teacher.get_layer(name=layer_name)
-                    teacher_outputs.append(layer.output)
-                    teacher_output_names.append(layer_name)
-
-                self._teacher_feature_extractor = keras.Model(
-                    inputs=inputs,
-                    outputs=teacher_outputs,
-                    name=f"{self.teacher.name}_multi_feature_extractor",
-                )
-                self._teacher_output_names = teacher_output_names
-            except (ValueError, AttributeError):
-                # Fallback to individual extraction for subclassed models
-                self._teacher_feature_extractor = None
+            self._create_feature_extractor(
+                self.teacher, teacher_layer_names, "teacher"
+            )
 
         if student_layer_names:
-            try:
-                # For Sequential models, use the last layer's output as final
-                if isinstance(self.student, keras.Sequential):
-                    final_output = self.student.layers[-1].output
-                    inputs = self.student.layers[0].input
-                else:
-                    # For Functional models
-                    if (
-                        not hasattr(self.student, "inputs")
-                        or self.student.inputs is None
-                    ):
-                        raise ValueError("Student model has no defined inputs")
-                    if (
-                        not hasattr(self.student, "output")
-                        or self.student.output is None
-                    ):
-                        raise ValueError("Student model has no defined output")
-                    final_output = self.student.output
-                    inputs = self.student.inputs
-
-                student_outputs = [final_output]  # Always include final output
-                student_output_names = ["final_output"]
-
-                for layer_name in student_layer_names:
-                    layer = self.student.get_layer(name=layer_name)
-                    student_outputs.append(layer.output)
-                    student_output_names.append(layer_name)
-
-                self._student_feature_extractor = keras.Model(
-                    inputs=inputs,
-                    outputs=student_outputs,
-                    name=f"{self.student.name}_multi_feature_extractor",
-                )
-                self._student_output_names = student_output_names
-            except (ValueError, AttributeError):
-                # Fallback to individual extraction for subclassed models
-                self._student_feature_extractor = None
+            self._create_feature_extractor(
+                self.student, student_layer_names, "student"
+            )
 
-    def _validate_feature_extraction_setup(self):
-        """Validate that feature extraction setup succeeded for
-        FeatureDistillation strategies."""
-        for strategy in self.strategies:
-            # Check if strategy has layer names (indicates FeatureDistillation)
-            if (
-                hasattr(strategy, "teacher_layer_name")
-                and strategy.teacher_layer_name is not None
-            ):
-                if self._teacher_feature_extractor is None:
-                    raise RuntimeError(
-                        f"FeatureDistillation strategy targeting teacher layer "
-                        f"'{strategy.teacher_layer_name}' failed to create "
-                        f"feature extractor. This can happen with subclassed "
-                        f"models or models that haven't been built. Consider "
-                        f"using LogitsDistillation instead, or ensure your "
-                        f"models are built by calling them with sample input."
+    def _create_feature_extractor(self, model, layer_names, model_type):
+        """Create feature extractor for a model."""
+        try:
+            # Get model inputs and final output
+            if isinstance(model, keras.Sequential):
+                final_output = model.layers[-1].output
+                inputs = model.layers[0].input
+            else:
+                if not hasattr(model, "inputs") or model.inputs is None:
+                    raise ValueError(
+                        f"{model_type} model has no defined inputs"
                     )
-            if (
-                hasattr(strategy, "student_layer_name")
-                and strategy.student_layer_name is not None
-            ):
-                if self._student_feature_extractor is None:
-                    raise RuntimeError(
-                        f"FeatureDistillation strategy targeting student layer "
-                        f"'{strategy.student_layer_name}' failed to create "
-                        f"feature extractor. This can happen with subclassed "
-                        f"models or models that haven't been built. Consider "
-                        f"using LogitsDistillation instead, or ensure your "
-                        f"models are built by calling them with sample input."
+                if not hasattr(model, "output") or model.output is None:
+                    raise ValueError(
+                        f"{model_type} model has no defined output"
                     )
+                final_output = model.output
+                inputs = model.inputs
+
+            # Collect outputs
+            outputs = [final_output]  # Always include final output
+            output_names = ["final_output"]
+
+            for layer_name in layer_names:
+                layer = model.get_layer(name=layer_name)
+                outputs.append(layer.output)
+                output_names.append(layer_name)
+
+            # Create extractor
+            extractor = keras.Model(
+                inputs=inputs,
+                outputs=outputs,
+                name=f"{model.name}_multi_feature_extractor",
+            )
+
+            # Store based on model type
+            if model_type == "teacher":
+                self._teacher_feature_extractor = extractor
+                self._teacher_output_names = output_names
+            else:
+                self._student_feature_extractor = extractor
+                self._student_output_names = output_names
+
+        except (ValueError, AttributeError):
+            # Fallback for subclassed models
+            if model_type == "teacher":
+                self._teacher_feature_extractor = None
+            else:
+                self._student_feature_extractor = None
 
     def _extract_all_teacher_features(self, x):
         """Extract all teacher features in a single forward pass.
@@ -582,6 +525,26 @@ def _get_strategy_features(self, strategy, all_features, is_teacher):
 
         return all_features[layer_name]
 
+    def compile(self, optimizer="adam", loss=None, metrics=None, **kwargs):
+        """Compile the distiller with proper integration.
+
+        Args:
+            optimizer: Optimizer for training the student model.
+            loss: Student loss function (stored for serialization).
+            metrics: Additional metrics to track during training.
+            **kwargs: Additional arguments passed to parent compile.
+        """
+        # Store the student loss for serialization (not used in training)
+        self._student_loss_for_serialization = loss
+
+        # Compile with a dummy loss since we override compute_loss
+        super().compile(
+            optimizer=optimizer,
+            loss=None,  # We handle loss in compute_loss
+            metrics=metrics,
+            **kwargs,
+        )
+
     @property
     def student_model(self):
         """The trained student model for independent use.
@@ -604,17 +567,20 @@ def compute_loss(
             x: Input data.
             y: Target data.
             y_pred: Model predictions.
-            sample_weight: Sample weights.
+            sample_weight: Sample weights (currently unused).
             training: Whether the model is in training mode.
 
         Returns:
             Combined loss tensor.
         """
+        # Handle case where y_pred is not provided
+        if y_pred is None:
+            y_pred = self(x, training=training)
         # Compute student loss using tree operations for dicts, manual for lists
         student_loss = 0.0
         if self.student_loss_weight > 0.0 and y is not None:
             if isinstance(self._student_loss, dict):
-                # Dict case - check keys match at runtime (keys can change)
+                # Dict case - check keys match at runtime
                 loss_keys = set(self._student_loss.keys())
                 y_keys = set(y.keys())
                 pred_keys = set(y_pred.keys())
@@ -625,7 +591,6 @@ def compute_loss(
                         f"Target keys: {y_keys}, Prediction keys: {pred_keys}"
                     )
 
-                # Compute losses manually and sum using tree.flatten
                 loss_values = {
                     key: self._student_loss[key](y[key], y_pred[key])
                     for key in self._student_loss.keys()
@@ -641,7 +606,6 @@ def compute_loss(
                         f"({len(self._student_loss)}) must match."
                     )
 
-                # Compute losses manually and sum using tree.flatten
                 loss_values = [
                     loss_fn(y_true, y_pred_i)
                     for loss_fn, y_true, y_pred_i in zip(
@@ -661,15 +625,17 @@ def compute_loss(
         # Compute distillation loss
         distillation_loss = 0.0
         if self.student_loss_weight < 1.0:
-            # Extract all features efficiently in single forward passes
             teacher_features = self._extract_all_teacher_features(x)
             student_features = self._extract_all_student_features(x, y_pred)
 
             # Apply strategies using pre-extracted features
             for strategy, weight in zip(self.strategies, self.strategy_weights):
                 # Get appropriate outputs/features for this strategy
-                if hasattr(strategy, "teacher_layer_name"):
-                    # FeatureDistillation - use extracted features
+                if (
+                    hasattr(strategy, "teacher_layer_name")
+                    and strategy.teacher_layer_name is not None
+                ):
+                    # FeatureDistillation with specific layers
                     try:
                         strategy_teacher_output = self._get_strategy_features(
                             strategy, teacher_features, is_teacher=True
@@ -685,14 +651,13 @@ def compute_loss(
                             f"targeting teacher layer "
                             f"'{strategy.teacher_layer_name}' and student "
                             f"layer '{strategy.student_layer_name}'. This can "
-                            f"happen "
-                            f"with subclassed models or models that haven't "
+                            f"happen with subclassed models that haven't "
                             f"been built properly. Consider using only "
                             f"LogitsDistillation for such models. "
                             f"Original error: {e}"
                         ) from e
                 else:
-                    # LogitsDistillation - use final model outputs
+                    # LogitsDistillation or FeatureDistillation (final outputs)
                     strategy_teacher_output = teacher_features["final_output"]
                     strategy_student_output = y_pred
 
@@ -734,15 +699,30 @@ def reset_metrics(self):
     @property
     def metrics(self):
         """Return list of metrics."""
-        return [
+        # Get parent metrics (from compile)
+        parent_metrics = []
+        if hasattr(super(), "metrics"):
+            parent_metrics = [
+                m
+                for m in super().metrics
+                if m
+                not in [
+                    self.total_loss_tracker,
+                    self.student_loss_tracker,
+                    self.distillation_loss_tracker,
+                ]
+            ]
+
+        # Add our custom loss trackers first
+        distillation_metrics = [
             self.total_loss_tracker,
             self.student_loss_tracker,
             self.distillation_loss_tracker,
         ]
+        return distillation_metrics + parent_metrics
 
     def get_config(self):
         """Get configuration for serialization."""
-
         config = super().get_config()
         config.update(
             {
@@ -758,6 +738,18 @@ def get_config(self):
                 ],
                 "strategy_weights": self.strategy_weights,
                 "student_loss_weight": self.student_loss_weight,
+                # Save current state, not initial parameters
+                "optimizer": serialization_lib.serialize_keras_object(
+                    self.optimizer
+                )
+                if hasattr(self, "optimizer") and self.optimizer
+                else None,
+                "student_loss": serialization_lib.serialize_keras_object(
+                    getattr(self, "_student_loss_for_serialization", None)
+                ),
+                # Note: metrics are not easily serializable due to
+                # CompileMetrics complexity, so we skip them in serialization
+                "metrics": None,
             }
         )
         return config
@@ -765,7 +757,9 @@ def get_config(self):
     @classmethod
     def from_config(cls, config):
         """Create instance from configuration."""
+        config = config.copy()
 
+        # Deserialize objects
         config["teacher"] = serialization_lib.deserialize_keras_object(
             config["teacher"]
         )
@@ -776,4 +770,19 @@ def from_config(cls, config):
             serialization_lib.deserialize_keras_object(strategy)
             for strategy in config["strategies"]
         ]
+
+        # Handle optional parameters
+        if "optimizer" in config and config["optimizer"] is not None:
+            config["optimizer"] = serialization_lib.deserialize_keras_object(
+                config["optimizer"]
+            )
+        if "student_loss" in config and config["student_loss"] is not None:
+            config["student_loss"] = serialization_lib.deserialize_keras_object(
+                config["student_loss"]
+            )
+        if "metrics" in config and config["metrics"] is not None:
+            config["metrics"] = serialization_lib.deserialize_keras_object(
+                config["metrics"]
+            )
+
         return cls(**config)

From 77750795115713f7535ab49ef9fed8d7b9469e22 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Thu, 4 Sep 2025 17:09:36 -0700
Subject: [PATCH 27/31] fix tests

---
 .../distillation/distillation_loss_test.py    | 13 +++++---
 keras/src/distillation/distiller.py           |  6 ++--
 keras/src/distillation/distiller_test.py      | 33 +++++++++++++++----
 3 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/keras/src/distillation/distillation_loss_test.py b/keras/src/distillation/distillation_loss_test.py
index b2b3c3322c1a..38af50fee619 100644
--- a/keras/src/distillation/distillation_loss_test.py
+++ b/keras/src/distillation/distillation_loss_test.py
@@ -94,6 +94,15 @@ def test_logits_distillation_end_to_end(self):
             ]
         )
 
+        # Create test data
+        x = np.random.random((32, 20)).astype(np.float32)
+        y = np.random.randint(0, 10, (32,)).astype(np.int32)
+
+        # Build models to avoid JAX tracer issues
+        dummy_input = x[:2]
+        teacher(dummy_input)
+        student(dummy_input)
+
         # Create distiller
         distiller = Distiller(
             teacher=teacher,
@@ -105,10 +114,6 @@ def test_logits_distillation_end_to_end(self):
             metrics=["accuracy"],
         )
 
-        # Create test data
-        x = np.random.random((32, 20)).astype(np.float32)
-        y = np.random.randint(0, 10, (32,)).astype(np.int32)
-
         # Test training
         history = distiller.fit(x, y, epochs=2, verbose=0)
 
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 0a558973e7c2..4add4fe781e1 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -173,9 +173,6 @@ def convert_loss_to_function(loss):
 
         self._create_multi_feature_extractors()
 
-        # Initialize the model - compile with provided parameters
-        self.compile(optimizer=optimizer, loss=student_loss, metrics=metrics)
-
         # Freeze teacher model
         self.teacher.trainable = False
 
@@ -186,6 +183,9 @@ def convert_loss_to_function(loss):
         )
         self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
 
+        # Initialize the model - compile with provided parameters
+        self.compile(optimizer=optimizer, loss=student_loss, metrics=metrics)
+
     def _validate_models(self, teacher, student):
         """Validate that teacher and student models are compatible."""
         # Basic model type validation
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index bfdafc836392..4fd50880f60a 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -45,11 +45,20 @@ def setUp(self):
         """Set up test fixtures."""
         super().setUp()
 
+        # Create test data
+        self.x = np.random.random((20, 5)).astype(np.float32)
+        self.y = np.random.randint(0, 10, (20,)).astype(np.int32)
+
         # Create teacher and student models
         self.teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
         self.student = SimpleStudent(vocab_size=10, hidden_dim=16)
 
-        # Create distillation strategy with explicit temperature
+        # Build models
+        dummy_input = self.x[:2]
+        self.teacher(dummy_input)
+        self.student(dummy_input)
+
+        # Create distillation strategy
         self.strategy = LogitsDistillation(temperature=2.0)
 
         # Create distiller
@@ -63,10 +72,6 @@ def setUp(self):
             metrics=["accuracy"],
         )
 
-        # Create test data
-        self.x = np.random.random((20, 5)).astype(np.float32)
-        self.y = np.random.randint(0, 10, (20,)).astype(np.int32)
-
     def test_distiller_initialization(self):
         """Test Distiller initialization."""
         # Check that teacher is frozen
@@ -176,7 +181,7 @@ def test_multi_strategy_functionality(self):
         self.assertEqual(distiller.strategy_weights, [0.7, 0.3])
 
         # Test training
-        x = np.random.random((10, 8)).astype(np.float32)
+        x = np.random.random((10, 5)).astype(np.float32)
         y = np.random.randint(0, 10, (10,))
         history = distiller.fit(x, y, epochs=1, verbose=0)
 
@@ -217,6 +222,7 @@ def test_multi_strategy_validation(self):
             )
 
     def test_student_loss_weighting(self):
+        """Test student loss weighting functionality."""
         # Test with student_loss_weight = 0.0 (only distillation loss)
         distiller_0 = Distiller(
             teacher=self.teacher,
@@ -262,6 +268,11 @@ def test_full_training_workflow(self):
         teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
         student = SimpleStudent(vocab_size=10, hidden_dim=16)
 
+        # Build models to avoid JAX tracer issues
+        dummy_input = x_train[:2]
+        teacher(dummy_input)
+        student(dummy_input)
+
         # Create distiller
         distiller = Distiller(
             teacher=teacher,
@@ -328,6 +339,11 @@ def test_evaluation_workflow(self):
         teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
         student = SimpleStudent(vocab_size=10, hidden_dim=16)
 
+        # Build models to avoid JAX tracer issues
+        dummy_input = x_test[:2]
+        teacher(dummy_input)
+        student(dummy_input)
+
         # Create distiller
         distiller = Distiller(
             teacher=teacher,
@@ -362,6 +378,11 @@ def test_prediction_workflow(self):
         teacher = SimpleTeacher(vocab_size=10, hidden_dim=32)
         student = SimpleStudent(vocab_size=10, hidden_dim=16)
 
+        # Build models to avoid JAX tracer issues
+        dummy_input = x_test[:2]
+        teacher(dummy_input)
+        student(dummy_input)
+
         # Create distiller
         distiller = Distiller(
             teacher=teacher,

From a078fb47ecfd18cc8ae3842e91fcc74959c833b3 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 8 Sep 2025 17:03:37 -0700
Subject: [PATCH 28/31] address reveiw comments

---
 keras/src/distillation/distillation_loss.py   | 105 ++--
 .../distillation/distillation_loss_test.py    |  54 +-
 keras/src/distillation/distiller.py           | 486 +++++++-----------
 keras/src/distillation/distiller_test.py      |  57 +-
 4 files changed, 280 insertions(+), 422 deletions(-)

diff --git a/keras/src/distillation/distillation_loss.py b/keras/src/distillation/distillation_loss.py
index d2c1bcbf4f99..c18ead423695 100644
--- a/keras/src/distillation/distillation_loss.py
+++ b/keras/src/distillation/distillation_loss.py
@@ -1,6 +1,7 @@
 import keras
 from keras.src import tree
 from keras.src.api_export import keras_export
+from keras.src.saving import serialization_lib
 
 
 @keras_export("keras.distillation.DistillationLoss")
@@ -60,6 +61,22 @@ def validate_outputs(self, teacher_outputs, student_outputs):
                 f"student has {len(student_outputs)} outputs."
             )
 
+    def validate_model_compatibility(self, teacher, student):
+        """Validate that teacher and student models are compatible.
+
+        This method ensures that the teacher and student models are compatible
+        for the specific distillation strategy. It should check model structure,
+        layer availability, and other strategy-specific requirements.
+
+        Args:
+            teacher: The teacher model.
+            student: The student model.
+        Raises:
+            ValueError: If models are not compatible with this strategy.
+        """
+        # can be overridden by subclasses
+        pass
+
 
 @keras_export("keras.distillation.FeatureDistillation")
 class FeatureDistillation(DistillationLoss):
@@ -74,8 +91,7 @@ class FeatureDistillation(DistillationLoss):
         loss: Loss function to use for feature distillation. Can be:
             - String identifier (e.g., 'mse', 'cosine_similarity', 'mae')
             - Keras loss instance
-            - List/tuple of losses for multi-output models
-            - Dict of losses for named outputs
+            - Nested structure of losses matching the layer output structure
             Defaults to 'mse'.
         teacher_layer_name: Name of the teacher layer to extract features from.
             If None, uses the final output. Defaults to None.
@@ -120,10 +136,6 @@ def __init__(
         self.teacher_layer_name = teacher_layer_name
         self.student_layer_name = student_layer_name
 
-        # Feature extraction models (created when needed)
-        self._teacher_feature_model = None
-        self._student_feature_model = None
-
         # Convert loss structure to functions using tree.map_structure
         def convert_loss_to_function(loss_item):
             if isinstance(loss_item, str):
@@ -139,49 +151,6 @@ def convert_loss_to_function(loss_item):
 
         self.loss = tree.map_structure(convert_loss_to_function, loss)
 
-    def _get_features(
-        self, model, inputs, training, layer_name, feature_model_attr
-    ):
-        """Extract features from model at specified layer.
-
-        Args:
-            model: The model to extract features from.
-            inputs: Input data.
-            training: Whether model is in training mode.
-            layer_name: Name of layer to extract from (None for final output).
-            feature_model_attr: Attribute name to cache feature extraction
-                model.
-
-        Returns:
-            Extracted features.
-        """
-        # No specific layer, use the final model output
-        if layer_name is None:
-            return model(inputs, training=training)
-
-        # For intermediate layer extraction, create feature extractor if needed
-        if getattr(self, feature_model_attr) is None:
-            try:
-                setattr(
-                    self,
-                    feature_model_attr,
-                    self._create_feature_extractor(model, layer_name),
-                )
-            except ValueError as e:
-                if "no defined inputs" in str(e).lower():
-                    # Build the model by calling it with inputs first
-                    _ = model(inputs, training=training)
-                    # Now try again
-                    setattr(
-                        self,
-                        feature_model_attr,
-                        self._create_feature_extractor(model, layer_name),
-                    )
-                else:
-                    raise
-
-        return getattr(self, feature_model_attr)(inputs, training=training)
-
     def validate_model_compatibility(self, teacher, student):
         """Validate that teacher and student models are compatible for feature
         distillation."""
@@ -338,7 +307,7 @@ def from_config(cls, config):
 
 
 @keras_export("keras.distillation.LogitsDistillation")
-class LogitsDistillation(FeatureDistillation):
+class LogitsDistillation(DistillationLoss):
     """Distillation strategy that transfers knowledge from final model outputs.
 
     This strategy applies temperature scaling to the teacher's logits before
@@ -353,8 +322,7 @@ class LogitsDistillation(FeatureDistillation):
             - String identifier (e.g., 'kl_divergence',
               'categorical_crossentropy')
             - Keras loss instance
-            - List/tuple of losses for multi-output models
-            - Dict of losses for named outputs
+            - Nested structure of losses matching the model output structure
             Defaults to 'kl_divergence'.
 
     Examples:
@@ -388,12 +356,20 @@ def __init__(
         temperature=3.0,
         loss="kl_divergence",
     ):
-        # Always use final outputs (no intermediate layers)
-        super().__init__(
-            loss=loss, teacher_layer_name=None, student_layer_name=None
-        )
         self.temperature = temperature
 
+        # Convert loss structure to functions using tree.map_structure
+        def convert_loss_to_function(loss_item):
+            if isinstance(loss_item, str):
+                loss_fn = keras.losses.get(loss_item)
+                if loss_fn is None:
+                    raise ValueError(f"Unknown loss function: {loss_item}")
+                return loss_fn
+            else:
+                return loss_item
+
+        self.loss = tree.map_structure(convert_loss_to_function, loss)
+
         # Validate temperature
         if not isinstance(self.temperature, (int, float)):
             raise ValueError(
@@ -418,10 +394,10 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """
         # Apply temperature scaling using tree.map_structure
         teacher_scaled = tree.map_structure(
-            lambda x: x / self.temperature, teacher_outputs
+            lambda x: keras.ops.divide(x, self.temperature), teacher_outputs
         )
         student_scaled = tree.map_structure(
-            lambda x: x / self.temperature, student_outputs
+            lambda x: keras.ops.divide(x, self.temperature), student_outputs
         )
 
         # Apply loss function(s) to corresponding outputs
@@ -451,19 +427,14 @@ def apply_loss(loss_fn, teacher_logits, student_logits):
 
     def get_config(self):
         """Get configuration for serialization."""
-        config = super().get_config()
-        config["temperature"] = self.temperature
-        return config
+        return {
+            "temperature": self.temperature,
+            "loss": serialization_lib.serialize_keras_object(self.loss),
+        }
 
     @classmethod
     def from_config(cls, config):
         """Create instance from configuration."""
         config = config.copy()
         config["loss"] = keras.losses.deserialize(config["loss"])
-
-        # Filter out parameters that LogitsDistillation doesn't accept
-        # (inherited from FeatureDistillation's get_config)
-        config.pop("teacher_layer_name", None)
-        config.pop("student_layer_name", None)
-
         return cls(**config)
diff --git a/keras/src/distillation/distillation_loss_test.py b/keras/src/distillation/distillation_loss_test.py
index 38af50fee619..c40399926e54 100644
--- a/keras/src/distillation/distillation_loss_test.py
+++ b/keras/src/distillation/distillation_loss_test.py
@@ -13,9 +13,7 @@ class TestLogitsDistillation(TestCase):
     """Test cases for LogitsDistillation strategy."""
 
     def test_logits_distillation_basic(self):
-        """Test basic logits distillation loss computation."""
-        strategy = LogitsDistillation(temperature=2.0)
-
+        """Test basic logits distillation structure validation."""
         # Create dummy logits
         teacher_logits = keras.ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
@@ -24,13 +22,8 @@ def test_logits_distillation_basic(self):
             np.array([[2.0, 1.0, 4.0], [3.0, 6.0, 2.0]]), dtype="float32"
         )
 
-        # Compute loss
-        loss = strategy.compute_loss(teacher_logits, student_logits)
-
-        # Check that loss is a scalar tensor
-        self.assertEqual(len(loss.shape), 0)
-        self.assertTrue(keras.ops.isfinite(loss))
-        self.assertGreater(loss, 0.0)
+        # Verify that teacher and student outputs have the same structure
+        keras.tree.assert_same_structure(teacher_logits, student_logits)
 
 
 @pytest.mark.requires_trainable_backend
@@ -38,9 +31,7 @@ class TestFeatureDistillation(TestCase):
     """Test cases for FeatureDistillation strategy."""
 
     def test_feature_distillation_basic(self):
-        """Test basic feature distillation loss computation."""
-        strategy = FeatureDistillation(loss="mse")
-
+        """Test basic feature distillation structure validation."""
         # Create dummy features
         teacher_features = keras.ops.convert_to_tensor(
             np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32"
@@ -49,13 +40,8 @@ def test_feature_distillation_basic(self):
             np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32"
         )
 
-        # Compute loss
-        loss = strategy.compute_loss(teacher_features, student_features)
-
-        # Check that loss is a scalar tensor
-        self.assertEqual(len(loss.shape), 0)
-        self.assertTrue(keras.ops.isfinite(loss))
-        self.assertGreater(loss, 0.0)
+        # Verify that teacher and student outputs have the same structure
+        keras.tree.assert_same_structure(teacher_features, student_features)
 
 
 @pytest.mark.requires_trainable_backend
@@ -109,8 +95,12 @@ def test_logits_distillation_end_to_end(self):
             student=student,
             strategies=LogitsDistillation(temperature=3.0),
             student_loss_weight=0.5,
-            optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            student_loss="sparse_categorical_crossentropy",
+        )
+
+        # Compile distiller
+        distiller.compile(
+            optimizer="adam",
+            loss="sparse_categorical_crossentropy",
             metrics=["accuracy"],
         )
 
@@ -132,7 +122,7 @@ def test_logits_distillation_end_to_end(self):
         self.assertEqual(predictions.shape, (5, 10))
 
         # Test student model access
-        student_model = distiller.student_model
+        student_model = distiller.student
         self.assertIsInstance(student_model, keras.Model)
 
     def test_feature_distillation_end_to_end(self):
@@ -178,8 +168,13 @@ def test_feature_distillation_end_to_end(self):
                 student_layer_name="student_dense_1",
             ),
             student_loss_weight=0.5,
-            optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            student_loss="sparse_categorical_crossentropy",
+        )
+
+        # Compile distiller
+        distiller.compile(
+            optimizer="adam",
+            loss="sparse_categorical_crossentropy",
+            metrics=["accuracy"],
         )
 
         # Create test data
@@ -260,8 +255,13 @@ def test_multi_strategy_distillation_end_to_end(self):
             strategies=strategies,
             strategy_weights=[1.0, 0.5, 0.3],
             student_loss_weight=0.5,
-            optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            student_loss="sparse_categorical_crossentropy",
+        )
+
+        # Compile distiller
+        distiller.compile(
+            optimizer="adam",
+            loss="sparse_categorical_crossentropy",
+            metrics=["accuracy"],
         )
 
         # Create test data
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 4add4fe781e1..5916ffd699a6 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -25,15 +25,16 @@ class Distiller(Model):
             have the same length as `strategies`. If None, equal weights used.
         student_loss_weight: Weight for the student's supervised loss component.
             Must be between 0 and 1. Defaults to 0.5.
-        optimizer: Optimizer for training the student model. Can be a string
-            identifier (e.g., `'adam'`) or an optimizer instance.
-        student_loss: Loss function for the student's supervised learning
-            component. Can be a string identifier or a loss function instance.
-        metrics: List of metrics to track during training.
         name: Name for the distiller model. Defaults to `"distiller"`.
         **kwargs: Additional keyword arguments passed to the parent `Model`
             class.
 
+    Attributes:
+        student: The student model being trained. Access this to get the trained
+            student model for independent use after distillation training.
+        teacher: The teacher model providing knowledge. This model is frozen
+            during training.
+
     Examples:
 
     ```python
@@ -50,15 +51,20 @@ class Distiller(Model):
         teacher=teacher,
         student=student,
         strategies=LogitsDistillation(temperature=3.0),
+    )
+
+    # Compile the distiller (like any Keras model)
+    distiller.compile(
         optimizer='adam',
-        student_loss='sparse_categorical_crossentropy'
+        loss='sparse_categorical_crossentropy',
+        metrics=['accuracy']
     )
 
     # Train the distiller
     distiller.fit(x_train, y_train, epochs=10)
 
     # Access the trained student model
-    trained_student = distiller.student_model
+    trained_student = distiller.student
 
     # Multiple distillation strategies
     distiller = Distiller(
@@ -72,8 +78,13 @@ class Distiller(Model):
             )
         ],
         strategy_weights=[1.0, 0.5],
+    )
+
+    # Compile with custom settings
+    distiller.compile(
         optimizer='adam',
-        student_loss='sparse_categorical_crossentropy'
+        loss='sparse_categorical_crossentropy',
+        metrics=['accuracy']
     )
     ```
     """
@@ -85,9 +96,6 @@ def __init__(
         strategies,
         strategy_weights=None,
         student_loss_weight=0.5,
-        optimizer="adam",
-        student_loss="sparse_categorical_crossentropy",
-        metrics=None,
         name="distiller",
         **kwargs,
     ):
@@ -113,35 +121,6 @@ def __init__(
             )
         self.student_loss_weight = student_loss_weight
 
-        # Validate metrics parameter
-        if metrics is not None and not isinstance(metrics, (list, tuple)):
-            raise ValueError(
-                f"metrics must be a list or tuple, got {type(metrics)}"
-            )
-
-            # Convert string loss to function using tree.map_structure
-
-        def convert_loss_to_function(loss):
-            if isinstance(loss, str):
-                loss_fn = keras.losses.get(loss)
-                if loss_fn is None:
-                    raise ValueError(
-                        f"Unknown loss function: '{loss}'. "
-                        "Please provide a valid loss function name or instance."
-                    )
-                return loss_fn
-            elif loss is None:
-                raise ValueError(
-                    "Student loss function cannot be None. "
-                    "Please provide a valid 'student_loss' parameter."
-                )
-            else:
-                return loss
-
-        self._student_loss = tree.map_structure(
-            convert_loss_to_function, student_loss
-        )
-
         # Handle strategies configuration
         if strategies is None:
             raise ValueError(
@@ -183,9 +162,6 @@ def convert_loss_to_function(loss):
         )
         self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
 
-        # Initialize the model - compile with provided parameters
-        self.compile(optimizer=optimizer, loss=student_loss, metrics=metrics)
-
     def _validate_models(self, teacher, student):
         """Validate that teacher and student models are compatible."""
         # Basic model type validation
@@ -223,36 +199,14 @@ def _validate_input_compatibility(self, teacher, student):
         if teacher_inputs is None or student_inputs is None:
             return
 
-        # Handle single input case
-        if not isinstance(teacher_inputs, (list, tuple)):
-            teacher_inputs = [teacher_inputs]
-        if not isinstance(student_inputs, (list, tuple)):
-            student_inputs = [student_inputs]
-
-        # Check number of inputs
-        if len(teacher_inputs) != len(student_inputs):
-            raise ValueError(
-                f"Teacher and student must have the same number of inputs. "
-                f"Teacher has {len(teacher_inputs)} inputs, "
-                f"student has {len(student_inputs)} inputs."
-            )
-
-        # Check input shapes
-        for i, (teacher_input, student_input) in enumerate(
-            zip(teacher_inputs, student_inputs)
-        ):
-            teacher_shape = teacher_input.shape
-            student_shape = student_input.shape
-
-            # Check if shapes are compatible (allowing for batch dimension
-            # flexibility)
-            if not self._shapes_are_compatible(teacher_shape, student_shape):
-                raise ValueError(
-                    f"Input {i} shapes are incompatible. "
-                    f"Teacher input shape: {teacher_shape}, "
-                    f"Student input shape: {student_shape}. "
-                    f"All dimensions except batch size must match."
-                )
+        # Validate input structures and shapes
+        tree.map_structure(
+            lambda ti, si: self._assert_shapes_are_compatible(
+                ti.shape, si.shape, "input"
+            ),
+            teacher_inputs,
+            student_inputs,
+        )
 
     def _validate_output_compatibility(self, teacher, student):
         """Validate that teacher and student have compatible output shapes."""
@@ -264,36 +218,25 @@ def _validate_output_compatibility(self, teacher, student):
         if teacher_outputs is None or student_outputs is None:
             return
 
-        # Handle single output case
-        if not isinstance(teacher_outputs, (list, tuple)):
-            teacher_outputs = [teacher_outputs]
-        if not isinstance(student_outputs, (list, tuple)):
-            student_outputs = [student_outputs]
+        # Validate output structures and shapes
+        tree.map_structure(
+            lambda to, so: self._assert_shapes_are_compatible(
+                to.shape, so.shape, "output"
+            ),
+            teacher_outputs,
+            student_outputs,
+        )
 
-        # Check number of outputs
-        if len(teacher_outputs) != len(student_outputs):
+    def _assert_same_dtype(self, teacher_dtype, student_dtype, context):
+        """Assert that teacher and student dtypes are the same."""
+        if teacher_dtype != student_dtype:
             raise ValueError(
-                f"Teacher and student must have the same number of outputs. "
-                f"Teacher has {len(teacher_outputs)} outputs, "
-                f"student has {len(student_outputs)} outputs."
+                f"Teacher and student {context} dtypes are incompatible. "
+                f"Teacher {context} dtype: {teacher_dtype}, "
+                f"Student {context} dtype: {student_dtype}. "
+                f"Both models must use the same data type."
             )
 
-        # Check output shapes
-        for i, (teacher_output, student_output) in enumerate(
-            zip(teacher_outputs, student_outputs)
-        ):
-            teacher_shape = teacher_output.shape
-            student_shape = student_output.shape
-
-            # For distillation, output shapes should be compatible
-            if not self._shapes_are_compatible(teacher_shape, student_shape):
-                raise ValueError(
-                    f"Output {i} shapes are incompatible. "
-                    f"Teacher output shape: {teacher_shape}, "
-                    f"Student output shape: {student_shape}. "
-                    f"All dimensions except batch size must match."
-                )
-
     def _validate_dtype_compatibility(self, teacher, student):
         """Validate that teacher and student have compatible data types."""
         # If symbolic tensors are not available (subclassed models), skip.
@@ -301,68 +244,53 @@ def _validate_dtype_compatibility(self, teacher, student):
             return
         if teacher.inputs is None or student.inputs is None:
             return
-        teacher_dtypes = [input.dtype for input in teacher.inputs]
-        student_dtypes = [input.dtype for input in student.inputs]
 
         # Check input dtypes
-        for i, (teacher_dtype, student_dtype) in enumerate(
-            zip(teacher_dtypes, student_dtypes)
-        ):
-            if teacher_dtype != student_dtype:
-                raise ValueError(
-                    f"Input {i} data types are incompatible. "
-                    f"Teacher dtype: {teacher_dtype}, "
-                    f"Student dtype: {student_dtype}."
-                )
+        tree.map_structure(
+            lambda ti, si: self._assert_same_dtype(ti.dtype, si.dtype, "input"),
+            teacher.inputs,
+            student.inputs,
+        )
 
         # Check output dtypes
-        teacher_output_dtypes = [output.dtype for output in teacher.outputs]
-        student_output_dtypes = [output.dtype for output in student.outputs]
+        if not hasattr(teacher, "outputs") or not hasattr(student, "outputs"):
+            return
+        if teacher.outputs is None or student.outputs is None:
+            return
 
-        for i, (teacher_dtype, student_dtype) in enumerate(
-            zip(teacher_output_dtypes, student_output_dtypes)
-        ):
-            if teacher_dtype != student_dtype:
-                raise ValueError(
-                    f"Output {i} data types are incompatible. "
-                    f"Teacher output dtype: {teacher_dtype}, "
-                    f"Student output dtype: {student_dtype}. "
-                    f"Both models must use the same data type."
-                )
+        tree.map_structure(
+            lambda to, so: self._assert_same_dtype(
+                to.dtype, so.dtype, "output"
+            ),
+            teacher.outputs,
+            student.outputs,
+        )
 
     def _validate_strategy_compatibility(self, teacher, student, strategy):
         """Validate that the strategy is compatible with the teacher and student
         models."""
-        if hasattr(strategy, "validate_model_compatibility"):
-            strategy.validate_model_compatibility(teacher, student)
+        strategy.validate_model_compatibility(teacher, student)
 
-    def _shapes_are_compatible(self, shape1, shape2):
-        """Check if two shapes are compatible (allowing for batch dimension
+    def _assert_shapes_are_compatible(self, shape1, shape2, context):
+        """Assert that two shapes are compatible (allowing for batch dimension
         flexibility)."""
-        # Convert to lists for easier handling
-        if hasattr(shape1, "as_list"):
-            shape1 = shape1.as_list()
-        elif hasattr(shape1, "__iter__"):
-            shape1 = list(shape1)
-        else:
-            shape1 = [shape1]
-
-        if hasattr(shape2, "as_list"):
-            shape2 = shape2.as_list()
-        elif hasattr(shape2, "__iter__"):
-            shape2 = list(shape2)
-        else:
-            shape2 = [shape2]
-
         # Check if they have the same number of dimensions
         if len(shape1) != len(shape2):
-            return False
+            raise ValueError(
+                f"Teacher and student {context} shapes have different number "
+                f"of dimensions. Teacher {context} shape: {shape1}, "
+                f"Student {context} shape: {shape2}."
+            )
 
-        # Check all dimensions except the first (batch dimension)
-        for dim1, dim2 in zip(shape1[1:], shape2[1:]):
+        # Check all dimensions (including batch dimension for distillation)
+        for dim1, dim2 in zip(shape1, shape2):
             if dim1 is not None and dim2 is not None and dim1 != dim2:
-                return False
-        return True
+                raise ValueError(
+                    f"Teacher and student {context} shapes are incompatible. "
+                    f"Teacher {context} shape: {shape1}, "
+                    f"Student {context} shape: {shape2}. "
+                    f"All dimensions must match for distillation."
+                )
 
     def _create_multi_feature_extractors(self):
         """Create feature extractors for efficient multi-layer extraction."""
@@ -385,21 +313,29 @@ def _create_multi_feature_extractors(self):
                     student_layer_names.append(strategy.student_layer_name)
 
         # Create multi-output feature extractors if needed
-        self._teacher_feature_extractor = None
-        self._student_feature_extractor = None
+        self._teacher_feature_extractor = (
+            self._create_feature_extractor(self.teacher, teacher_layer_names)
+            if teacher_layer_names
+            else None
+        )
 
-        if teacher_layer_names:
-            self._create_feature_extractor(
-                self.teacher, teacher_layer_names, "teacher"
-            )
+        self._student_feature_extractor = (
+            self._create_feature_extractor(self.student, student_layer_names)
+            if student_layer_names
+            else None
+        )
 
-        if student_layer_names:
-            self._create_feature_extractor(
-                self.student, student_layer_names, "student"
-            )
+    def _create_feature_extractor(self, model, layer_names):
+        """Create a feature extractor for a model.
+
+        Args:
+            model: The model to create an extractor for.
+            layer_names: List of layer names to extract features from.
 
-    def _create_feature_extractor(self, model, layer_names, model_type):
-        """Create feature extractor for a model."""
+        Returns:
+            keras.Model: Feature extractor that returns a dict of features,
+            or None if extractor creation fails.
+        """
         try:
             # Get model inputs and final output
             if isinstance(model, keras.Sequential):
@@ -408,45 +344,30 @@ def _create_feature_extractor(self, model, layer_names, model_type):
             else:
                 if not hasattr(model, "inputs") or model.inputs is None:
                     raise ValueError(
-                        f"{model_type} model has no defined inputs"
+                        f"{model.name} model has no defined inputs"
                     )
                 if not hasattr(model, "output") or model.output is None:
                     raise ValueError(
-                        f"{model_type} model has no defined output"
+                        f"{model.name} model has no defined output"
                     )
                 final_output = model.output
                 inputs = model.inputs
 
             # Collect outputs
-            outputs = [final_output]  # Always include final output
-            output_names = ["final_output"]
-
+            outputs = {"final_output": final_output}
             for layer_name in layer_names:
                 layer = model.get_layer(name=layer_name)
-                outputs.append(layer.output)
-                output_names.append(layer_name)
+                outputs[layer_name] = layer.output
 
-            # Create extractor
-            extractor = keras.Model(
+            # Create and return extractor
+            return keras.Model(
                 inputs=inputs,
                 outputs=outputs,
                 name=f"{model.name}_multi_feature_extractor",
             )
-
-            # Store based on model type
-            if model_type == "teacher":
-                self._teacher_feature_extractor = extractor
-                self._teacher_output_names = output_names
-            else:
-                self._student_feature_extractor = extractor
-                self._student_output_names = output_names
-
         except (ValueError, AttributeError):
             # Fallback for subclassed models
-            if model_type == "teacher":
-                self._teacher_feature_extractor = None
-            else:
-                self._student_feature_extractor = None
+            return None
 
     def _extract_all_teacher_features(self, x):
         """Extract all teacher features in a single forward pass.
@@ -458,18 +379,8 @@ def _extract_all_teacher_features(self, x):
             Dict mapping layer names to their outputs.
         """
         if self._teacher_feature_extractor is not None:
-            # Use efficient multi-output extractor
-            feature_outputs = self._teacher_feature_extractor(x, training=False)
-            if not isinstance(feature_outputs, (list, tuple)):
-                feature_outputs = [feature_outputs]
-
-            # Map outputs to layer names
-            features = {}
-            for name, output in zip(
-                self._teacher_output_names, feature_outputs
-            ):
-                features[name] = output
-            return features
+            # Use efficient multi-output extractor (returns dict directly)
+            return self._teacher_feature_extractor(x, training=False)
         else:
             # Fallback: just get final output for LogitsDistillation
             return {"final_output": self.teacher(x, training=False)}
@@ -485,18 +396,8 @@ def _extract_all_student_features(self, x, y_pred):
             Dict mapping layer names to their outputs.
         """
         if self._student_feature_extractor is not None:
-            # Use efficient multi-output extractor
-            feature_outputs = self._student_feature_extractor(x, training=True)
-            if not isinstance(feature_outputs, (list, tuple)):
-                feature_outputs = [feature_outputs]
-
-            # Map outputs to layer names
-            features = {}
-            for name, output in zip(
-                self._student_output_names, feature_outputs
-            ):
-                features[name] = output
-            return features
+            # Use efficient multi-output extractor (returns dict directly)
+            return self._student_feature_extractor(x, training=True)
         else:
             # Fallback: use y_pred for final output to avoid recomputation
             return {"final_output": y_pred}
@@ -530,13 +431,42 @@ def compile(self, optimizer="adam", loss=None, metrics=None, **kwargs):
 
         Args:
             optimizer: Optimizer for training the student model.
-            loss: Student loss function (stored for serialization).
+            loss: Student loss function for the student's supervised learning.
+                Can be a string identifier or a loss function instance.
             metrics: Additional metrics to track during training.
             **kwargs: Additional arguments passed to parent compile.
         """
-        # Store the student loss for serialization (not used in training)
+        # Validate and convert student loss
+        if loss is None:
+            raise ValueError(
+                "Student loss function cannot be None. "
+                "Please provide a valid 'loss' parameter."
+            )
+
+        # Convert string loss to function using tree.map_structure
+        def convert_loss_to_function(loss_item):
+            if isinstance(loss_item, str):
+                loss_fn = keras.losses.get(loss_item)
+                if loss_fn is None:
+                    raise ValueError(
+                        f"Unknown loss function: '{loss_item}'. "
+                        "Please provide a valid loss function name or instance."
+                    )
+                return loss_fn
+            else:
+                return loss_item
+
+        self._student_loss = tree.map_structure(convert_loss_to_function, loss)
+
+        # Store the student loss for serialization
         self._student_loss_for_serialization = loss
 
+        # Validate metrics parameter
+        if metrics is not None and not isinstance(metrics, (list, tuple)):
+            raise ValueError(
+                f"metrics must be a list or tuple, got {type(metrics)}"
+            )
+
         # Compile with a dummy loss since we override compute_loss
         super().compile(
             optimizer=optimizer,
@@ -545,15 +475,6 @@ def compile(self, optimizer="adam", loss=None, metrics=None, **kwargs):
             **kwargs,
         )
 
-    @property
-    def student_model(self):
-        """The trained student model for independent use.
-
-        Returns:
-            keras.Model: The trained student model.
-        """
-        return self.student
-
     def call(self, inputs, training=None, **kwargs):
         """Forward pass returns student predictions."""
         return self.student(inputs, training=training, **kwargs)
@@ -579,44 +500,41 @@ def compute_loss(
         # Compute student loss using tree operations for dicts, manual for lists
         student_loss = 0.0
         if self.student_loss_weight > 0.0 and y is not None:
-            if isinstance(self._student_loss, dict):
-                # Dict case - check keys match at runtime
-                loss_keys = set(self._student_loss.keys())
-                y_keys = set(y.keys())
-                pred_keys = set(y_pred.keys())
-                if loss_keys != y_keys or y_keys != pred_keys:
-                    raise ValueError(
-                        f"Keys must match across loss functions, targets, and "
-                        f"predictions. Loss keys: {loss_keys}, "
-                        f"Target keys: {y_keys}, Prediction keys: {pred_keys}"
-                    )
-
-                loss_values = {
-                    key: self._student_loss[key](y[key], y_pred[key])
-                    for key in self._student_loss.keys()
-                }
-                flat_losses = tree.flatten(loss_values)
-                student_loss = keras.ops.sum(keras.ops.stack(flat_losses))
-            elif isinstance(self._student_loss, (list, tuple)):
-                # List/tuple case - check lengths match at runtime (can change)
-                if len(y) != len(y_pred) or len(self._student_loss) != len(y):
-                    raise ValueError(
-                        f"Number of targets ({len(y)}), predictions "
-                        f"({len(y_pred)}), and loss functions "
-                        f"({len(self._student_loss)}) must match."
-                    )
-
-                loss_values = [
-                    loss_fn(y_true, y_pred_i)
-                    for loss_fn, y_true, y_pred_i in zip(
-                        self._student_loss, y, y_pred
-                    )
-                ]
+            # Use tree.map_structure for cleaner loss computation
+            try:
+                loss_values = tree.map_structure(
+                    lambda l, o, o_pred: l(o, o_pred),
+                    self._student_loss,
+                    y,
+                    y_pred,
+                )
                 flat_losses = tree.flatten(loss_values)
-                student_loss = keras.ops.sum(keras.ops.stack(flat_losses))
-            else:
-                # Single output case
-                student_loss = self._student_loss(y, y_pred)
+                student_loss = (
+                    keras.ops.sum(keras.ops.stack(flat_losses))
+                    if len(flat_losses) > 1
+                    else flat_losses[0]
+                )
+            except (ValueError, TypeError):
+                # Fallback for TrackedDict compatibility issues
+                if isinstance(self._student_loss, dict):
+                    loss_values = {
+                        key: self._student_loss[key](y[key], y_pred[key])
+                        for key in self._student_loss.keys()
+                    }
+                    flat_losses = tree.flatten(loss_values)
+                    student_loss = keras.ops.sum(keras.ops.stack(flat_losses))
+                elif isinstance(self._student_loss, (list, tuple)):
+                    loss_values = [
+                        loss_fn(y_true, y_pred_i)
+                        for loss_fn, y_true, y_pred_i in zip(
+                            self._student_loss, y, y_pred
+                        )
+                    ]
+                    flat_losses = tree.flatten(loss_values)
+                    student_loss = keras.ops.sum(keras.ops.stack(flat_losses))
+                else:
+                    # Single output case
+                    student_loss = self._student_loss(y, y_pred)
 
             # Ensure student_loss is a scalar
             if hasattr(student_loss, "shape") and len(student_loss.shape) > 0:
@@ -667,7 +585,9 @@ def compute_loss(
                 )
 
                 # Apply weight and add to total
-                distillation_loss += weight * strategy_loss
+                distillation_loss = keras.ops.add(
+                    distillation_loss, keras.ops.multiply(weight, strategy_loss)
+                )
 
             # Ensure distillation_loss is a scalar
             if (
@@ -677,9 +597,12 @@ def compute_loss(
                 distillation_loss = keras.ops.mean(distillation_loss)
 
         # Combine losses
-        total_loss = (
-            self.student_loss_weight * student_loss
-            + (1.0 - self.student_loss_weight) * distillation_loss
+        total_loss = keras.ops.add(
+            keras.ops.multiply(self.student_loss_weight, student_loss),
+            keras.ops.multiply(
+                keras.ops.subtract(1.0, self.student_loss_weight),
+                distillation_loss,
+            ),
         )
 
         # Update metrics
@@ -696,31 +619,6 @@ def reset_metrics(self):
         self.distillation_loss_tracker.reset_state()
         self.total_loss_tracker.reset_state()
 
-    @property
-    def metrics(self):
-        """Return list of metrics."""
-        # Get parent metrics (from compile)
-        parent_metrics = []
-        if hasattr(super(), "metrics"):
-            parent_metrics = [
-                m
-                for m in super().metrics
-                if m
-                not in [
-                    self.total_loss_tracker,
-                    self.student_loss_tracker,
-                    self.distillation_loss_tracker,
-                ]
-            ]
-
-        # Add our custom loss trackers first
-        distillation_metrics = [
-            self.total_loss_tracker,
-            self.student_loss_tracker,
-            self.distillation_loss_tracker,
-        ]
-        return distillation_metrics + parent_metrics
-
     def get_config(self):
         """Get configuration for serialization."""
         config = super().get_config()
@@ -738,18 +636,6 @@ def get_config(self):
                 ],
                 "strategy_weights": self.strategy_weights,
                 "student_loss_weight": self.student_loss_weight,
-                # Save current state, not initial parameters
-                "optimizer": serialization_lib.serialize_keras_object(
-                    self.optimizer
-                )
-                if hasattr(self, "optimizer") and self.optimizer
-                else None,
-                "student_loss": serialization_lib.serialize_keras_object(
-                    getattr(self, "_student_loss_for_serialization", None)
-                ),
-                # Note: metrics are not easily serializable due to
-                # CompileMetrics complexity, so we skip them in serialization
-                "metrics": None,
             }
         )
         return config
@@ -771,18 +657,4 @@ def from_config(cls, config):
             for strategy in config["strategies"]
         ]
 
-        # Handle optional parameters
-        if "optimizer" in config and config["optimizer"] is not None:
-            config["optimizer"] = serialization_lib.deserialize_keras_object(
-                config["optimizer"]
-            )
-        if "student_loss" in config and config["student_loss"] is not None:
-            config["student_loss"] = serialization_lib.deserialize_keras_object(
-                config["student_loss"]
-            )
-        if "metrics" in config and config["metrics"] is not None:
-            config["metrics"] = serialization_lib.deserialize_keras_object(
-                config["metrics"]
-            )
-
         return cls(**config)
diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py
index 4fd50880f60a..67025c043aaa 100644
--- a/keras/src/distillation/distiller_test.py
+++ b/keras/src/distillation/distiller_test.py
@@ -67,8 +67,12 @@ def setUp(self):
             student=self.student,
             strategies=self.strategy,
             student_loss_weight=0.5,
+        )
+
+        # Compile distiller
+        self.distiller.compile(
             optimizer="adam",
-            student_loss="sparse_categorical_crossentropy",
+            loss="sparse_categorical_crossentropy",
             metrics=["accuracy"],
         )
 
@@ -132,8 +136,6 @@ def test_teacher_freezing(self):
             student=self.student,
             strategies=self.strategy,
             student_loss_weight=0.5,
-            optimizer=keras.optimizers.Adam(),
-            student_loss="sparse_categorical_crossentropy",
         )
 
         # Teacher should now be frozen
@@ -172,8 +174,13 @@ def test_multi_strategy_functionality(self):
             strategies=strategies,
             strategy_weights=strategy_weights,
             student_loss_weight=0.5,
+        )
+
+        # Compile distiller
+        distiller.compile(
             optimizer="adam",
-            student_loss="sparse_categorical_crossentropy",
+            loss="sparse_categorical_crossentropy",
+            metrics=["accuracy"],
         )
 
         # Test that strategies are stored correctly
@@ -203,8 +210,6 @@ def test_multi_strategy_validation(self):
             student=self.student,
             strategies=strategies,
             student_loss_weight=0.5,
-            optimizer="adam",
-            student_loss="sparse_categorical_crossentropy",
         )
 
         self.assertEqual(len(distiller.strategies), 2)
@@ -217,8 +222,6 @@ def test_multi_strategy_validation(self):
                 strategies=strategies,
                 strategy_weights=[1.0],  # Wrong length
                 student_loss_weight=0.5,
-                optimizer="adam",
-                student_loss="sparse_categorical_crossentropy",
             )
 
     def test_student_loss_weighting(self):
@@ -229,8 +232,6 @@ def test_student_loss_weighting(self):
             student=self.student,
             strategies=self.strategy,
             student_loss_weight=0.0,
-            optimizer=keras.optimizers.Adam(),
-            student_loss="sparse_categorical_crossentropy",
         )
 
         # Test with student_loss_weight = 1.0 (only student loss)
@@ -239,8 +240,18 @@ def test_student_loss_weighting(self):
             student=self.student,
             strategies=self.strategy,
             student_loss_weight=1.0,
-            optimizer=keras.optimizers.Adam(),
-            student_loss="sparse_categorical_crossentropy",
+        )
+
+        # Compile both distillers
+        distiller_0.compile(
+            optimizer="adam",
+            loss="sparse_categorical_crossentropy",
+            metrics=["accuracy"],
+        )
+        distiller_1.compile(
+            optimizer="adam",
+            loss="sparse_categorical_crossentropy",
+            metrics=["accuracy"],
         )
 
         # Test that they can be used for training without errors
@@ -279,8 +290,12 @@ def test_full_training_workflow(self):
             student=student,
             strategies=self.strategy,
             student_loss_weight=0.5,
-            optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            student_loss="sparse_categorical_crossentropy",
+        )
+
+        # Compile distiller
+        distiller.compile(
+            optimizer="adam",
+            loss="sparse_categorical_crossentropy",
             metrics=["accuracy"],
         )
 
@@ -350,8 +365,13 @@ def test_evaluation_workflow(self):
             student=student,
             strategies=self.strategy,
             student_loss_weight=0.5,
-            optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            student_loss="sparse_categorical_crossentropy",
+        )
+
+        # Compile distiller
+        distiller.compile(
+            optimizer="adam",
+            loss="sparse_categorical_crossentropy",
+            metrics=["accuracy"],
         )
 
         # Train briefly
@@ -389,8 +409,6 @@ def test_prediction_workflow(self):
             student=student,
             strategies=self.strategy,
             student_loss_weight=0.5,
-            optimizer=keras.optimizers.Adam(learning_rate=0.01),
-            student_loss="sparse_categorical_crossentropy",
         )
 
         # Make predictions
@@ -442,8 +460,6 @@ def test_distiller_serialization_and_saving(self):
             student=student,
             strategies=strategy,
             student_loss_weight=0.7,
-            optimizer=keras.optimizers.Adam(),
-            student_loss="sparse_categorical_crossentropy",
         )
 
         # Build the models by calling them
@@ -490,7 +506,6 @@ def test_distiller_serialization_and_saving(self):
 
             # Compile original distiller
             original_distiller.compile(
-                optimizer=keras.optimizers.Adam(),
                 loss="sparse_categorical_crossentropy",
             )
 

From e2f7f6b33e7708e7455afaf79260ed8087791949 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 8 Sep 2025 17:33:58 -0700
Subject: [PATCH 29/31] oh

---
 keras/src/distillation/distiller.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 5916ffd699a6..624a4924b4dd 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -590,10 +590,7 @@ def compute_loss(
                 )
 
             # Ensure distillation_loss is a scalar
-            if (
-                hasattr(distillation_loss, "shape")
-                and len(distillation_loss.shape) > 0
-            ):
+            if len(distillation_loss.shape) > 0:
                 distillation_loss = keras.ops.mean(distillation_loss)
 
         # Combine losses

From a5f4605fcfb8a130224dd9d721f5270dde2c9389 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Tue, 30 Sep 2025 14:11:07 -0700
Subject: [PATCH 30/31] address review comments

---
 keras/src/distillation/distillation_loss.py | 68 +---------------
 keras/src/distillation/distiller.py         | 86 +++++++++++----------
 2 files changed, 48 insertions(+), 106 deletions(-)

diff --git a/keras/src/distillation/distillation_loss.py b/keras/src/distillation/distillation_loss.py
index c18ead423695..32477fe0ff36 100644
--- a/keras/src/distillation/distillation_loss.py
+++ b/keras/src/distillation/distillation_loss.py
@@ -48,18 +48,9 @@ def validate_outputs(self, teacher_outputs, student_outputs):
             ValueError: If outputs are not compatible.
         """
         # Default implementation - can be overridden by subclasses
-        if not isinstance(teacher_outputs, (list, tuple)):
-            teacher_outputs = [teacher_outputs]
-        if not isinstance(student_outputs, (list, tuple)):
-            student_outputs = [student_outputs]
-
-        if len(teacher_outputs) != len(student_outputs):
-            raise ValueError(
-                f"Teacher and student must have the same number of "
-                f"outputs. "
-                f"Teacher has {len(teacher_outputs)} outputs, "
-                f"student has {len(student_outputs)} outputs."
-            )
+        # keras.tree.assert_same_structure validates that structures match,
+        # including the number of outputs, so no additional checks needed
+        keras.tree.assert_same_structure(teacher_outputs, student_outputs)
 
     def validate_model_compatibility(self, teacher, student):
         """Validate that teacher and student models are compatible.
@@ -167,59 +158,6 @@ def validate_model_compatibility(self, teacher, student):
             except ValueError as e:
                 raise ValueError(f"In student model: {e}")
 
-    def _create_feature_extractor(self, model, layer_name):
-        """Create a feature extractor function for the specified layer.
-
-        Args:
-            model: The model to extract features from.
-            layer_name: Name of the layer to extract features from.
-                       If None, returns the original model.
-
-        Returns:
-            A keras.Model that extracts features from the specified layer.
-        """
-        if layer_name is None:
-            # Return the original model if no layer specified
-            return model
-
-        # Get the layer using Keras built-in method
-        try:
-            target_layer = model.get_layer(name=layer_name)
-        except ValueError as e:
-            raise ValueError(
-                f"Layer '{layer_name}' not found in model '{model.name}'. {e}"
-            )
-
-        # Create a new model that extracts features from the specified layer.
-        try:
-            return keras.Model(
-                inputs=model.inputs,
-                outputs=target_layer.output,
-                name=f"{model.name}_features_{layer_name}",
-            )
-        except (ValueError, AttributeError) as e:
-            # Handle the case where the model doesn't have defined inputs yet
-            error_msg = str(e).lower()
-            if (
-                "no defined inputs" in error_msg
-                or "has no defined inputs" in error_msg
-            ):
-                raise ValueError(
-                    f"Model '{model.name}' has no defined inputs yet. "
-                    f"Please call the model with some input data first to "
-                    f"build it, or use the Functional API to create models "
-                    f"with explicit inputs. For Sequential models, you can "
-                    f"call model(dummy_input) or model.build(input_shape) "
-                    f"before using FeatureDistillation."
-                )
-            else:
-                raise ValueError(
-                    f"Could not create a feature extraction model for layer "
-                    f"'{layer_name}'. This is likely because the model is a "
-                    f"subclassed model that cannot be traversed using the "
-                    f"standard layer API. Error: {e}"
-                )
-
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for feature distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 624a4924b4dd..9094204e3f67 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -189,6 +189,37 @@ def _validate_models(self, teacher, student):
         # Validate data type compatibility
         self._validate_dtype_compatibility(teacher, student)
 
+    def _assert_shapes_are_compatible(self, shape1, shape2, context):
+        """Assert that two shapes are compatible (allowing for batch dimension
+        flexibility)."""
+        # Check if they have the same number of dimensions
+        if len(shape1) != len(shape2):
+            raise ValueError(
+                f"Teacher and student {context} shapes have different number "
+                f"of dimensions. Teacher {context} shape: {shape1}, "
+                f"Student {context} shape: {shape2}."
+            )
+
+        # Check all dimensions (including batch dimension for distillation)
+        for dim1, dim2 in zip(shape1, shape2):
+            if dim1 is not None and dim2 is not None and dim1 != dim2:
+                raise ValueError(
+                    f"Teacher and student {context} shapes are incompatible. "
+                    f"Teacher {context} shape: {shape1}, "
+                    f"Student {context} shape: {shape2}. "
+                    f"All dimensions must match for distillation."
+                )
+
+    def _assert_same_dtype(self, teacher_dtype, student_dtype, context):
+        """Assert that teacher and student dtypes are the same."""
+        if teacher_dtype != student_dtype:
+            raise ValueError(
+                f"Teacher and student {context} dtypes are incompatible. "
+                f"Teacher {context} dtype: {teacher_dtype}, "
+                f"Student {context} dtype: {student_dtype}. "
+                f"Both models must use the same data type."
+            )
+
     def _validate_input_compatibility(self, teacher, student):
         """Validate that teacher and student have compatible input shapes."""
         # If symbolic tensors are not available (subclassed models), skip.
@@ -227,16 +258,6 @@ def _validate_output_compatibility(self, teacher, student):
             student_outputs,
         )
 
-    def _assert_same_dtype(self, teacher_dtype, student_dtype, context):
-        """Assert that teacher and student dtypes are the same."""
-        if teacher_dtype != student_dtype:
-            raise ValueError(
-                f"Teacher and student {context} dtypes are incompatible. "
-                f"Teacher {context} dtype: {teacher_dtype}, "
-                f"Student {context} dtype: {student_dtype}. "
-                f"Both models must use the same data type."
-            )
-
     def _validate_dtype_compatibility(self, teacher, student):
         """Validate that teacher and student have compatible data types."""
         # If symbolic tensors are not available (subclassed models), skip.
@@ -271,27 +292,6 @@ def _validate_strategy_compatibility(self, teacher, student, strategy):
         models."""
         strategy.validate_model_compatibility(teacher, student)
 
-    def _assert_shapes_are_compatible(self, shape1, shape2, context):
-        """Assert that two shapes are compatible (allowing for batch dimension
-        flexibility)."""
-        # Check if they have the same number of dimensions
-        if len(shape1) != len(shape2):
-            raise ValueError(
-                f"Teacher and student {context} shapes have different number "
-                f"of dimensions. Teacher {context} shape: {shape1}, "
-                f"Student {context} shape: {shape2}."
-            )
-
-        # Check all dimensions (including batch dimension for distillation)
-        for dim1, dim2 in zip(shape1, shape2):
-            if dim1 is not None and dim2 is not None and dim1 != dim2:
-                raise ValueError(
-                    f"Teacher and student {context} shapes are incompatible. "
-                    f"Teacher {context} shape: {shape1}, "
-                    f"Student {context} shape: {shape2}. "
-                    f"All dimensions must match for distillation."
-                )
-
     def _create_multi_feature_extractors(self):
         """Create feature extractors for efficient multi-layer extraction."""
         # Collect all layer names needed for feature extraction
@@ -313,16 +313,11 @@ def _create_multi_feature_extractors(self):
                     student_layer_names.append(strategy.student_layer_name)
 
         # Create multi-output feature extractors if needed
-        self._teacher_feature_extractor = (
-            self._create_feature_extractor(self.teacher, teacher_layer_names)
-            if teacher_layer_names
-            else None
+        self._teacher_feature_extractor = self._create_feature_extractor(
+            self.teacher, teacher_layer_names
         )
-
-        self._student_feature_extractor = (
-            self._create_feature_extractor(self.student, student_layer_names)
-            if student_layer_names
-            else None
+        self._student_feature_extractor = self._create_feature_extractor(
+            self.student, student_layer_names
         )
 
     def _create_feature_extractor(self, model, layer_names):
@@ -334,8 +329,12 @@ def _create_feature_extractor(self, model, layer_names):
 
         Returns:
             keras.Model: Feature extractor that returns a dict of features,
-            or None if extractor creation fails.
+            or None if no layer names provided or extractor creation fails.
         """
+        # Return None if no layer names provided
+        if not layer_names:
+            return None
+
         try:
             # Get model inputs and final output
             if isinstance(model, keras.Sequential):
@@ -579,6 +578,11 @@ def compute_loss(
                     strategy_teacher_output = teacher_features["final_output"]
                     strategy_student_output = y_pred
 
+                # Validate outputs are compatible for this strategy
+                strategy.validate_outputs(
+                    strategy_teacher_output, strategy_student_output
+                )
+
                 # Compute loss for this strategy
                 strategy_loss = strategy.compute_loss(
                     strategy_teacher_output, strategy_student_output

From df077584ae477937dc983d50dbfbd9de8555f5be Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli@gmail.com>
Date: Mon, 6 Oct 2025 13:34:33 -0700
Subject: [PATCH 31/31] address review comments

---
 keras/src/distillation/distillation_loss.py | 168 +++++++++--------
 keras/src/distillation/distiller.py         | 197 ++++++--------------
 2 files changed, 148 insertions(+), 217 deletions(-)

diff --git a/keras/src/distillation/distillation_loss.py b/keras/src/distillation/distillation_loss.py
index 32477fe0ff36..7a08547572b9 100644
--- a/keras/src/distillation/distillation_loss.py
+++ b/keras/src/distillation/distillation_loss.py
@@ -4,6 +4,30 @@
 from keras.src.saving import serialization_lib
 
 
+def _convert_loss_to_function(loss_item):
+    """Convert a loss string identifier to a loss function.
+
+    Args:
+        loss_item: Either a string identifier, a loss function instance,
+            or None.
+
+    Returns:
+        A loss function instance, or None.
+
+    Raises:
+        ValueError: If the loss string identifier is unknown.
+    """
+    if loss_item is None:
+        return None
+    elif isinstance(loss_item, str):
+        loss_fn = keras.losses.get(loss_item)
+        if loss_fn is None:
+            raise ValueError(f"Unknown loss function: '{loss_item}'.")
+        return loss_fn
+    else:
+        return loss_item
+
+
 @keras_export("keras.distillation.DistillationLoss")
 class DistillationLoss:
     """Base class for distillation loss computation.
@@ -37,35 +61,23 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
     def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that teacher and student outputs are compatible.
 
-        This method ensures that the outputs from teacher and student models
-        are compatible for the specific distillation strategy. It should check
-        shapes, dimensions, and other requirements.
-
         Args:
             teacher_outputs: Outputs from the teacher model.
             student_outputs: Outputs from the student model.
         Raises:
             ValueError: If outputs are not compatible.
         """
-        # Default implementation - can be overridden by subclasses
-        # keras.tree.assert_same_structure validates that structures match,
-        # including the number of outputs, so no additional checks needed
         keras.tree.assert_same_structure(teacher_outputs, student_outputs)
 
     def validate_model_compatibility(self, teacher, student):
         """Validate that teacher and student models are compatible.
 
-        This method ensures that the teacher and student models are compatible
-        for the specific distillation strategy. It should check model structure,
-        layer availability, and other strategy-specific requirements.
-
         Args:
             teacher: The teacher model.
             student: The student model.
         Raises:
             ValueError: If models are not compatible with this strategy.
         """
-        # can be overridden by subclasses
         pass
 
 
@@ -83,7 +95,9 @@ class FeatureDistillation(DistillationLoss):
             - String identifier (e.g., 'mse', 'cosine_similarity', 'mae')
             - Keras loss instance
             - Nested structure of losses matching the layer output structure
-            Defaults to 'mse'.
+            - None to skip distillation for that output (useful for multi-output
+              models where you only want to distill some outputs)
+            At least one loss must be non-None. Defaults to 'mse'.
         teacher_layer_name: Name of the teacher layer to extract features from.
             If None, uses the final output. Defaults to None.
         student_layer_name: Name of the student layer to extract features from.
@@ -118,6 +132,11 @@ class FeatureDistillation(DistillationLoss):
     strategy = FeatureDistillation(
         loss=["mse", "cosine_similarity"]
     )
+
+    # For multi-output models, only distill some outputs
+    strategy = FeatureDistillation(
+        loss=["mse", None, "cosine_similarity"]  # Skip middle output
+    )
     ```
     """
 
@@ -126,26 +145,44 @@ def __init__(
     ):
         self.teacher_layer_name = teacher_layer_name
         self.student_layer_name = student_layer_name
+        self.loss = tree.map_structure(_convert_loss_to_function, loss)
 
-        # Convert loss structure to functions using tree.map_structure
-        def convert_loss_to_function(loss_item):
-            if isinstance(loss_item, str):
-                loss_fn = keras.losses.get(loss_item)
-                if loss_fn is None:
-                    raise ValueError(
-                        f"Unknown loss function: '{loss_item}'. "
-                        "Please provide a valid loss function name or instance."
-                    )
-                return loss_fn
-            else:
-                return loss_item
-
-        self.loss = tree.map_structure(convert_loss_to_function, loss)
+        flat_losses = tree.flatten(self.loss)
+        if all(l is None for l in flat_losses):
+            raise ValueError("At least one loss must be non-None.")
 
     def validate_model_compatibility(self, teacher, student):
         """Validate that teacher and student models are compatible for feature
         distillation."""
-        # Check if specified layers exist in the models
+        if (
+            self.teacher_layer_name is not None
+            or self.student_layer_name is not None
+        ):
+            teacher_is_subclassed = (
+                not hasattr(teacher, "inputs") or teacher.inputs is None
+            )
+            student_is_subclassed = (
+                not hasattr(student, "inputs") or student.inputs is None
+            )
+
+            if teacher_is_subclassed or student_is_subclassed:
+                subclassed_models = []
+                if teacher_is_subclassed:
+                    subclassed_models.append("teacher")
+                if student_is_subclassed:
+                    subclassed_models.append("student")
+
+                models_str = " and ".join(subclassed_models)
+                raise ValueError(
+                    f"FeatureDistillation with specific layer names requires "
+                    f"Functional or Sequential models. The {models_str} "
+                    f"model(s) appear to be subclassed (no symbolic "
+                    f"inputs/outputs). Either use Functional/Sequential "
+                    f"models, or use FeatureDistillation without layer names "
+                    f"(to distill final outputs only), or use "
+                    f"LogitsDistillation instead."
+                )
+
         if self.teacher_layer_name is not None:
             try:
                 teacher.get_layer(name=self.teacher_layer_name)
@@ -162,69 +199,45 @@ def validate_outputs(self, teacher_outputs, student_outputs):
         """Validate that outputs are compatible for feature distillation."""
         super().validate_outputs(teacher_outputs, student_outputs)
 
-        # Validate that loss structure matches output structure
         try:
             tree.assert_same_structure(self.loss, teacher_outputs)
-            tree.assert_same_structure(self.loss, student_outputs)
         except ValueError as e:
             raise ValueError(
-                f"Loss structure must match output structure. "
+                f"Loss structure mismatch. "
                 f"Loss structure: {tree.structure(self.loss)}, "
-                f"Teacher output structure: {tree.structure(teacher_outputs)}, "
-                f"Student output structure: {tree.structure(student_outputs)}. "
+                f"Output structure: {tree.structure(teacher_outputs)}. "
                 f"Error: {e}"
             )
 
-        # For feature distillation, validate layer compatibility if specified
-        if (
-            self.teacher_layer_name is not None
-            and self.student_layer_name is not None
-        ):
-            # Validate that the specified layers exist and are compatible
-            self._validate_layer_compatibility(teacher_outputs, student_outputs)
-
-    def _validate_layer_compatibility(self, teacher_outputs, student_outputs):
-        """Validate that the specified layers are compatible for feature
-        distillation."""
-        # This method would be called by the distiller to validate layer
-        # compatibility when using feature distillation with specific layer
-        # names
-        pass
-
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute feature distillation loss using extracted features.
 
         Args:
-            teacher_outputs: Extracted features from the specified teacher
-                layer.
-            student_outputs: Extracted features from the specified student
-                layer.
+            teacher_outputs: Extracted features from teacher layer.
+            student_outputs: Extracted features from student layer.
             **kwargs: Additional arguments (ignored).
         Returns:
-            Feature distillation loss tensor.
+            Scalar distillation loss tensor.
         """
 
-        # Apply loss function(s) to corresponding features
         def apply_loss(loss_fn, teacher_features, student_features):
+            if loss_fn is None:
+                return 0.0
+
             loss = keras.ops.mean(loss_fn(teacher_features, student_features))
 
-            # Special handling for cosine similarity (convert similarity to
-            # distance)
             if (
                 hasattr(loss_fn, "__name__")
                 and "cosine" in loss_fn.__name__.lower()
             ):
-                # Convert similarity to distance: distance = 1 - similarity
-                loss = 1.0 - loss
+                loss = keras.ops.subtract(1.0, loss)
 
             return loss
 
-        # Apply losses using tree.map_structure
         loss_values = tree.map_structure(
             apply_loss, self.loss, teacher_outputs, student_outputs
         )
 
-        # Sum all losses and return scalar
         flat_losses = tree.flatten(loss_values)
         return keras.ops.sum(keras.ops.stack(flat_losses))
 
@@ -261,7 +274,9 @@ class LogitsDistillation(DistillationLoss):
               'categorical_crossentropy')
             - Keras loss instance
             - Nested structure of losses matching the model output structure
-            Defaults to 'kl_divergence'.
+            - None to skip distillation for that output (useful for multi-output
+              models where you only want to distill some outputs)
+            At least one loss must be non-None. Defaults to 'kl_divergence'.
 
     Examples:
 
@@ -286,6 +301,12 @@ class LogitsDistillation(DistillationLoss):
         temperature=3.0,
         loss=["kl_divergence", "categorical_crossentropy"]
     )
+
+    # For multi-output models, only distill some outputs
+    strategy = LogitsDistillation(
+        temperature=3.0,
+        loss=["kl_divergence", None]  # Skip second output
+    )
     ```
     """
 
@@ -295,28 +316,18 @@ def __init__(
         loss="kl_divergence",
     ):
         self.temperature = temperature
+        self.loss = tree.map_structure(_convert_loss_to_function, loss)
 
-        # Convert loss structure to functions using tree.map_structure
-        def convert_loss_to_function(loss_item):
-            if isinstance(loss_item, str):
-                loss_fn = keras.losses.get(loss_item)
-                if loss_fn is None:
-                    raise ValueError(f"Unknown loss function: {loss_item}")
-                return loss_fn
-            else:
-                return loss_item
-
-        self.loss = tree.map_structure(convert_loss_to_function, loss)
+        flat_losses = tree.flatten(self.loss)
+        if all(l is None for l in flat_losses):
+            raise ValueError("At least one loss must be non-None.")
 
-        # Validate temperature
         if not isinstance(self.temperature, (int, float)):
             raise ValueError(
                 f"temperature must be a number, got {type(self.temperature)}"
             )
         if self.temperature <= 0.0:
-            raise ValueError(
-                "temperature must be > 0. Set a positive value (e.g., 1-10)."
-            )
+            raise ValueError("temperature must be positive.")
 
     def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
         """Compute distillation loss using the configured loss function.
@@ -340,6 +351,9 @@ def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
 
         # Apply loss function(s) to corresponding outputs
         def apply_loss(loss_fn, teacher_logits, student_logits):
+            if loss_fn is None:
+                return 0.0
+
             # Special handling for KL divergence (needs probabilities)
             if (
                 hasattr(loss_fn, "__name__")
diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py
index 9094204e3f67..ca802a775e1e 100644
--- a/keras/src/distillation/distiller.py
+++ b/keras/src/distillation/distiller.py
@@ -1,6 +1,7 @@
 import keras
 from keras.src import tree
 from keras.src.api_export import keras_export
+from keras.src.distillation.distillation_loss import _convert_loss_to_function
 from keras.src.models.model import Model
 from keras.src.saving import serialization_lib
 
@@ -124,9 +125,9 @@ def __init__(
         # Handle strategies configuration
         if strategies is None:
             raise ValueError(
-                "Must specify 'strategies'. "
-                "Please provide a valid distillation strategy such as "
-                "LogitsDistillation, FeatureDistillation, or a list."
+                "'strategies' cannot be None. Provide a distillation "
+                "strategy (e.g., LogitsDistillation or FeatureDistillation) "
+                "or a list of strategies."
             )
 
         # Convert single strategy to list for uniform handling
@@ -164,7 +165,6 @@ def __init__(
 
     def _validate_models(self, teacher, student):
         """Validate that teacher and student models are compatible."""
-        # Basic model type validation
         if not isinstance(teacher, keras.Model):
             raise ValueError(
                 f"Teacher must be a keras.Model, got {type(teacher)}"
@@ -174,55 +174,36 @@ def _validate_models(self, teacher, student):
                 f"Student must be a keras.Model, got {type(student)}"
             )
 
-        # Check if models are built
-        # Subclassed models may not be built at this point and may not expose
-        # symbolic `inputs`/`outputs`. We avoid hard failures here and rely on
-        # runtime checks during the first call/fit. When symbolic tensors are
-        # available, we perform full compatibility validation below.
-
-        # Validate input compatibility
         self._validate_input_compatibility(teacher, student)
-
-        # Validate output compatibility
         self._validate_output_compatibility(teacher, student)
-
-        # Validate data type compatibility
         self._validate_dtype_compatibility(teacher, student)
 
     def _assert_shapes_are_compatible(self, shape1, shape2, context):
-        """Assert that two shapes are compatible (allowing for batch dimension
-        flexibility)."""
-        # Check if they have the same number of dimensions
+        """Assert that two shapes are compatible."""
         if len(shape1) != len(shape2):
             raise ValueError(
-                f"Teacher and student {context} shapes have different number "
-                f"of dimensions. Teacher {context} shape: {shape1}, "
-                f"Student {context} shape: {shape2}."
+                f"Teacher and student {context} shapes have different "
+                f"dimensions. Teacher: {shape1}, Student: {shape2}."
             )
 
-        # Check all dimensions (including batch dimension for distillation)
         for dim1, dim2 in zip(shape1, shape2):
             if dim1 is not None and dim2 is not None and dim1 != dim2:
                 raise ValueError(
                     f"Teacher and student {context} shapes are incompatible. "
-                    f"Teacher {context} shape: {shape1}, "
-                    f"Student {context} shape: {shape2}. "
-                    f"All dimensions must match for distillation."
+                    f"Teacher: {shape1}, Student: {shape2}. "
+                    f"All dimensions must match."
                 )
 
     def _assert_same_dtype(self, teacher_dtype, student_dtype, context):
         """Assert that teacher and student dtypes are the same."""
         if teacher_dtype != student_dtype:
             raise ValueError(
-                f"Teacher and student {context} dtypes are incompatible. "
-                f"Teacher {context} dtype: {teacher_dtype}, "
-                f"Student {context} dtype: {student_dtype}. "
-                f"Both models must use the same data type."
+                f"Teacher and student {context} dtypes must match. "
+                f"Teacher: {teacher_dtype}, Student: {student_dtype}."
             )
 
     def _validate_input_compatibility(self, teacher, student):
         """Validate that teacher and student have compatible input shapes."""
-        # If symbolic tensors are not available (subclassed models), skip.
         if not hasattr(teacher, "inputs") or not hasattr(student, "inputs"):
             return
         teacher_inputs = getattr(teacher, "inputs")
@@ -230,7 +211,6 @@ def _validate_input_compatibility(self, teacher, student):
         if teacher_inputs is None or student_inputs is None:
             return
 
-        # Validate input structures and shapes
         tree.map_structure(
             lambda ti, si: self._assert_shapes_are_compatible(
                 ti.shape, si.shape, "input"
@@ -241,7 +221,6 @@ def _validate_input_compatibility(self, teacher, student):
 
     def _validate_output_compatibility(self, teacher, student):
         """Validate that teacher and student have compatible output shapes."""
-        # If symbolic tensors are not available (subclassed models), skip.
         if not hasattr(teacher, "outputs") or not hasattr(student, "outputs"):
             return
         teacher_outputs = getattr(teacher, "outputs")
@@ -249,7 +228,6 @@ def _validate_output_compatibility(self, teacher, student):
         if teacher_outputs is None or student_outputs is None:
             return
 
-        # Validate output structures and shapes
         tree.map_structure(
             lambda to, so: self._assert_shapes_are_compatible(
                 to.shape, so.shape, "output"
@@ -260,20 +238,17 @@ def _validate_output_compatibility(self, teacher, student):
 
     def _validate_dtype_compatibility(self, teacher, student):
         """Validate that teacher and student have compatible data types."""
-        # If symbolic tensors are not available (subclassed models), skip.
         if not hasattr(teacher, "inputs") or not hasattr(student, "inputs"):
             return
         if teacher.inputs is None or student.inputs is None:
             return
 
-        # Check input dtypes
         tree.map_structure(
             lambda ti, si: self._assert_same_dtype(ti.dtype, si.dtype, "input"),
             teacher.inputs,
             student.inputs,
         )
 
-        # Check output dtypes
         if not hasattr(teacher, "outputs") or not hasattr(student, "outputs"):
             return
         if teacher.outputs is None or student.outputs is None:
@@ -294,7 +269,6 @@ def _validate_strategy_compatibility(self, teacher, student, strategy):
 
     def _create_multi_feature_extractors(self):
         """Create feature extractors for efficient multi-layer extraction."""
-        # Collect all layer names needed for feature extraction
         teacher_layer_names = []
         student_layer_names = []
 
@@ -312,7 +286,6 @@ def _create_multi_feature_extractors(self):
                 if strategy.student_layer_name not in student_layer_names:
                     student_layer_names.append(strategy.student_layer_name)
 
-        # Create multi-output feature extractors if needed
         self._teacher_feature_extractor = self._create_feature_extractor(
             self.teacher, teacher_layer_names
         )
@@ -328,90 +301,52 @@ def _create_feature_extractor(self, model, layer_names):
             layer_names: List of layer names to extract features from.
 
         Returns:
-            keras.Model: Feature extractor that returns a dict of features,
-            or None if no layer names provided or extractor creation fails.
+            Feature extractor model or None if no layer names provided.
+
+        Raises:
+            ValueError: If model has no symbolic inputs/outputs.
         """
-        # Return None if no layer names provided
         if not layer_names:
             return None
 
-        try:
-            # Get model inputs and final output
-            if isinstance(model, keras.Sequential):
-                final_output = model.layers[-1].output
-                inputs = model.layers[0].input
-            else:
-                if not hasattr(model, "inputs") or model.inputs is None:
-                    raise ValueError(
-                        f"{model.name} model has no defined inputs"
-                    )
-                if not hasattr(model, "output") or model.output is None:
-                    raise ValueError(
-                        f"{model.name} model has no defined output"
-                    )
-                final_output = model.output
-                inputs = model.inputs
-
-            # Collect outputs
-            outputs = {"final_output": final_output}
-            for layer_name in layer_names:
-                layer = model.get_layer(name=layer_name)
-                outputs[layer_name] = layer.output
-
-            # Create and return extractor
-            return keras.Model(
-                inputs=inputs,
-                outputs=outputs,
-                name=f"{model.name}_multi_feature_extractor",
+        if not hasattr(model, "inputs") or model.inputs is None:
+            raise ValueError(
+                f"Cannot create feature extractor for {model.name}. "
+                f"The model has no symbolic inputs attribute."
             )
-        except (ValueError, AttributeError):
-            # Fallback for subclassed models
-            return None
 
-    def _extract_all_teacher_features(self, x):
-        """Extract all teacher features in a single forward pass.
+        if isinstance(model, keras.Sequential):
+            final_output = model.layers[-1].output
+        else:
+            final_output = model.output
 
-        Args:
-            x: Input data.
+        outputs = {"final_output": final_output}
+        for layer_name in layer_names:
+            layer = model.get_layer(name=layer_name)
+            outputs[layer_name] = layer.output
 
-        Returns:
-            Dict mapping layer names to their outputs.
-        """
+        return keras.Model(
+            inputs=model.inputs,
+            outputs=outputs,
+            name=f"{model.name}_multi_feature_extractor",
+        )
+
+    def _extract_all_teacher_features(self, x):
+        """Extract all teacher features in a single forward pass."""
         if self._teacher_feature_extractor is not None:
-            # Use efficient multi-output extractor (returns dict directly)
             return self._teacher_feature_extractor(x, training=False)
         else:
-            # Fallback: just get final output for LogitsDistillation
             return {"final_output": self.teacher(x, training=False)}
 
     def _extract_all_student_features(self, x, y_pred):
-        """Extract all student features in a single forward pass.
-
-        Args:
-            x: Input data.
-            y_pred: Student predictions from forward pass.
-
-        Returns:
-            Dict mapping layer names to their outputs.
-        """
+        """Extract all student features in a single forward pass."""
         if self._student_feature_extractor is not None:
-            # Use efficient multi-output extractor (returns dict directly)
             return self._student_feature_extractor(x, training=True)
         else:
-            # Fallback: use y_pred for final output to avoid recomputation
             return {"final_output": y_pred}
 
     def _get_strategy_features(self, strategy, all_features, is_teacher):
-        """Get the specific features needed by a strategy.
-
-        Args:
-            strategy: The FeatureDistillation strategy.
-            all_features: Dict of all extracted features.
-            is_teacher: Whether these are teacher features.
-
-        Returns:
-            The specific features needed by this strategy.
-        """
+        """Get the specific features needed by a strategy."""
         if is_teacher:
             layer_name = strategy.teacher_layer_name or "final_output"
         else:
@@ -419,8 +354,8 @@ def _get_strategy_features(self, strategy, all_features, is_teacher):
 
         if layer_name not in all_features:
             raise ValueError(
-                f"Layer '{layer_name}' features not found in extracted "
-                f"features. Available features: {list(all_features.keys())}"
+                f"Layer '{layer_name}' not found in extracted features. "
+                f"Available: {list(all_features.keys())}"
             )
 
         return all_features[layer_name]
@@ -435,41 +370,20 @@ def compile(self, optimizer="adam", loss=None, metrics=None, **kwargs):
             metrics: Additional metrics to track during training.
             **kwargs: Additional arguments passed to parent compile.
         """
-        # Validate and convert student loss
         if loss is None:
-            raise ValueError(
-                "Student loss function cannot be None. "
-                "Please provide a valid 'loss' parameter."
-            )
+            raise ValueError("'loss' cannot be None.")
 
-        # Convert string loss to function using tree.map_structure
-        def convert_loss_to_function(loss_item):
-            if isinstance(loss_item, str):
-                loss_fn = keras.losses.get(loss_item)
-                if loss_fn is None:
-                    raise ValueError(
-                        f"Unknown loss function: '{loss_item}'. "
-                        "Please provide a valid loss function name or instance."
-                    )
-                return loss_fn
-            else:
-                return loss_item
-
-        self._student_loss = tree.map_structure(convert_loss_to_function, loss)
-
-        # Store the student loss for serialization
+        self._student_loss = tree.map_structure(_convert_loss_to_function, loss)
         self._student_loss_for_serialization = loss
 
-        # Validate metrics parameter
         if metrics is not None and not isinstance(metrics, (list, tuple)):
             raise ValueError(
                 f"metrics must be a list or tuple, got {type(metrics)}"
             )
 
-        # Compile with a dummy loss since we override compute_loss
         super().compile(
             optimizer=optimizer,
-            loss=None,  # We handle loss in compute_loss
+            loss=None,
             metrics=metrics,
             **kwargs,
         )
@@ -561,16 +475,12 @@ def compute_loss(
                             strategy, student_features, is_teacher=False
                         )
                     except ValueError as e:
-                        # Provide more helpful error message for feature
-                        # extraction failures
+                        # Re-raise with context about which strategy failed
                         raise RuntimeError(
-                            f"FeatureDistillation failed for strategy "
-                            f"targeting teacher layer "
+                            f"Failed to extract features for "
+                            f"FeatureDistillation targeting teacher layer "
                             f"'{strategy.teacher_layer_name}' and student "
-                            f"layer '{strategy.student_layer_name}'. This can "
-                            f"happen with subclassed models that haven't "
-                            f"been built properly. Consider using only "
-                            f"LogitsDistillation for such models. "
+                            f"layer '{strategy.student_layer_name}'. "
                             f"Original error: {e}"
                         ) from e
                 else:
@@ -588,15 +498,22 @@ def compute_loss(
                     strategy_teacher_output, strategy_student_output
                 )
 
+                # Validate that strategy returns a scalar
+                if (
+                    hasattr(strategy_loss, "shape")
+                    and len(strategy_loss.shape) > 0
+                ):
+                    raise ValueError(
+                        f"Strategy {strategy.__class__.__name__} returned a "
+                        f"non-scalar loss with shape {strategy_loss.shape}. "
+                        f"The compute_loss method must return a scalar tensor."
+                    )
+
                 # Apply weight and add to total
                 distillation_loss = keras.ops.add(
                     distillation_loss, keras.ops.multiply(weight, strategy_loss)
                 )
 
-            # Ensure distillation_loss is a scalar
-            if len(distillation_loss.shape) > 0:
-                distillation_loss = keras.ops.mean(distillation_loss)
-
         # Combine losses
         total_loss = keras.ops.add(
             keras.ops.multiply(self.student_loss_weight, student_loss),