diff --git a/keras/api/__init__.py b/keras/api/__init__.py index dee6cea5bb1..13343791723 100644 --- a/keras/api/__init__.py +++ b/keras/api/__init__.py @@ -12,6 +12,7 @@ from keras import config as config from keras import constraints as constraints from keras import datasets as datasets +from keras import distillation as distillation from keras import distribution as distribution from keras import dtype_policies as dtype_policies from keras import export as export diff --git a/keras/api/_tf_keras/keras/__init__.py b/keras/api/_tf_keras/keras/__init__.py index 67d4738a0f3..3457f05233e 100644 --- a/keras/api/_tf_keras/keras/__init__.py +++ b/keras/api/_tf_keras/keras/__init__.py @@ -10,6 +10,7 @@ from keras import config as config from keras import constraints as constraints from keras import datasets as datasets +from keras import distillation as distillation from keras import distribution as distribution from keras import dtype_policies as dtype_policies from keras import export as export diff --git a/keras/api/_tf_keras/keras/distillation/__init__.py b/keras/api/_tf_keras/keras/distillation/__init__.py new file mode 100644 index 00000000000..7f6fcd5bcc4 --- /dev/null +++ b/keras/api/_tf_keras/keras/distillation/__init__.py @@ -0,0 +1,16 @@ +"""DO NOT EDIT. + +This file was autogenerated. Do not edit it by hand, +since your modifications would be overwritten. +""" + +from keras.src.distillation.distillation_loss import ( + DistillationLoss as DistillationLoss, +) +from keras.src.distillation.distillation_loss import ( + FeatureDistillation as FeatureDistillation, +) +from keras.src.distillation.distillation_loss import ( + LogitsDistillation as LogitsDistillation, +) +from keras.src.distillation.distiller import Distiller as Distiller diff --git a/keras/api/distillation/__init__.py b/keras/api/distillation/__init__.py new file mode 100644 index 00000000000..7f6fcd5bcc4 --- /dev/null +++ b/keras/api/distillation/__init__.py @@ -0,0 +1,16 @@ +"""DO NOT EDIT. + +This file was autogenerated. Do not edit it by hand, +since your modifications would be overwritten. +""" + +from keras.src.distillation.distillation_loss import ( + DistillationLoss as DistillationLoss, +) +from keras.src.distillation.distillation_loss import ( + FeatureDistillation as FeatureDistillation, +) +from keras.src.distillation.distillation_loss import ( + LogitsDistillation as LogitsDistillation, +) +from keras.src.distillation.distiller import Distiller as Distiller diff --git a/keras/src/distillation/__init__.py b/keras/src/distillation/__init__.py new file mode 100644 index 00000000000..c903f357118 --- /dev/null +++ b/keras/src/distillation/__init__.py @@ -0,0 +1 @@ +"""Distillation module for knowledge distillation in Keras.""" diff --git a/keras/src/distillation/distillation_loss.py b/keras/src/distillation/distillation_loss.py new file mode 100644 index 00000000000..7a08547572b --- /dev/null +++ b/keras/src/distillation/distillation_loss.py @@ -0,0 +1,392 @@ +import keras +from keras.src import tree +from keras.src.api_export import keras_export +from keras.src.saving import serialization_lib + + +def _convert_loss_to_function(loss_item): + """Convert a loss string identifier to a loss function. + + Args: + loss_item: Either a string identifier, a loss function instance, + or None. + + Returns: + A loss function instance, or None. + + Raises: + ValueError: If the loss string identifier is unknown. 
+ """ + if loss_item is None: + return None + elif isinstance(loss_item, str): + loss_fn = keras.losses.get(loss_item) + if loss_fn is None: + raise ValueError(f"Unknown loss function: '{loss_item}'.") + return loss_fn + else: + return loss_item + + +@keras_export("keras.distillation.DistillationLoss") +class DistillationLoss: + """Base class for distillation loss computation. + + Distillation losses define how to compute the distillation loss + between teacher and student outputs. Each loss implements a specific + approach to knowledge transfer, from simple logits matching to feature-based + distillation. + + To create custom distillation losses, subclass this class and + override the `compute_loss` method. + """ + + def compute_loss(self, teacher_outputs, student_outputs, **kwargs): + """Compute distillation loss between teacher and student outputs. + + This method should implement the specific distillation logic for + transferring knowledge from teacher to student. + + Args: + teacher_outputs: Outputs from the teacher model. Can be a single + tensor or a list/tuple of tensors for multi-output models. + student_outputs: Outputs from the student model. Can be a single + tensor or a list/tuple of tensors for multi-output models. + **kwargs: Additional arguments for custom strategies. + Returns: + Distillation loss tensor. + """ + raise NotImplementedError("Subclasses must implement compute_loss") + + def validate_outputs(self, teacher_outputs, student_outputs): + """Validate that teacher and student outputs are compatible. + + Args: + teacher_outputs: Outputs from the teacher model. + student_outputs: Outputs from the student model. + Raises: + ValueError: If outputs are not compatible. + """ + keras.tree.assert_same_structure(teacher_outputs, student_outputs) + + def validate_model_compatibility(self, teacher, student): + """Validate that teacher and student models are compatible. + + Args: + teacher: The teacher model. + student: The student model. + Raises: + ValueError: If models are not compatible with this strategy. + """ + pass + + +@keras_export("keras.distillation.FeatureDistillation") +class FeatureDistillation(DistillationLoss): + """Feature distillation strategy using intermediate layer representations. + + Feature distillation transfers knowledge from intermediate layers of the + teacher model to corresponding layers of the student model. This approach + helps the student learn better internal representations and often leads + to better performance compared to logits-only distillation. + + Args: + loss: Loss function to use for feature distillation. Can be: + - String identifier (e.g., 'mse', 'cosine_similarity', 'mae') + - Keras loss instance + - Nested structure of losses matching the layer output structure + - None to skip distillation for that output (useful for multi-output + models where you only want to distill some outputs) + At least one loss must be non-None. Defaults to 'mse'. + teacher_layer_name: Name of the teacher layer to extract features from. + If None, uses the final output. Defaults to None. + student_layer_name: Name of the student layer to extract features from. + If None, uses the final output. Defaults to None. 
+ + Examples: + + ```python + # Basic feature distillation from final outputs + strategy = FeatureDistillation(loss="mse") + + # Distill from specific intermediate layers + strategy = FeatureDistillation( + loss="mse", + teacher_layer_name="dense_1", + student_layer_name="dense_1" + ) + + # Use cosine similarity for different feature sizes + strategy = FeatureDistillation( + loss="cosine_similarity", + teacher_layer_name="conv2d_2", + student_layer_name="conv2d_1" + ) + + # With custom loss instance + strategy = FeatureDistillation( + loss=keras.losses.MeanAbsoluteError() + ) + + # For multi-output models + strategy = FeatureDistillation( + loss=["mse", "cosine_similarity"] + ) + + # For multi-output models, only distill some outputs + strategy = FeatureDistillation( + loss=["mse", None, "cosine_similarity"] # Skip middle output + ) + ``` + """ + + def __init__( + self, loss="mse", teacher_layer_name=None, student_layer_name=None + ): + self.teacher_layer_name = teacher_layer_name + self.student_layer_name = student_layer_name + self.loss = tree.map_structure(_convert_loss_to_function, loss) + + flat_losses = tree.flatten(self.loss) + if all(l is None for l in flat_losses): + raise ValueError("At least one loss must be non-None.") + + def validate_model_compatibility(self, teacher, student): + """Validate that teacher and student models are compatible for feature + distillation.""" + if ( + self.teacher_layer_name is not None + or self.student_layer_name is not None + ): + teacher_is_subclassed = ( + not hasattr(teacher, "inputs") or teacher.inputs is None + ) + student_is_subclassed = ( + not hasattr(student, "inputs") or student.inputs is None + ) + + if teacher_is_subclassed or student_is_subclassed: + subclassed_models = [] + if teacher_is_subclassed: + subclassed_models.append("teacher") + if student_is_subclassed: + subclassed_models.append("student") + + models_str = " and ".join(subclassed_models) + raise ValueError( + f"FeatureDistillation with specific layer names requires " + f"Functional or Sequential models. The {models_str} " + f"model(s) appear to be subclassed (no symbolic " + f"inputs/outputs). Either use Functional/Sequential " + f"models, or use FeatureDistillation without layer names " + f"(to distill final outputs only), or use " + f"LogitsDistillation instead." + ) + + if self.teacher_layer_name is not None: + try: + teacher.get_layer(name=self.teacher_layer_name) + except ValueError as e: + raise ValueError(f"In teacher model: {e}") + + if self.student_layer_name is not None: + try: + student.get_layer(name=self.student_layer_name) + except ValueError as e: + raise ValueError(f"In student model: {e}") + + def validate_outputs(self, teacher_outputs, student_outputs): + """Validate that outputs are compatible for feature distillation.""" + super().validate_outputs(teacher_outputs, student_outputs) + + try: + tree.assert_same_structure(self.loss, teacher_outputs) + except ValueError as e: + raise ValueError( + f"Loss structure mismatch. " + f"Loss structure: {tree.structure(self.loss)}, " + f"Output structure: {tree.structure(teacher_outputs)}. " + f"Error: {e}" + ) + + def compute_loss(self, teacher_outputs, student_outputs, **kwargs): + """Compute feature distillation loss using extracted features. + + Args: + teacher_outputs: Extracted features from teacher layer. + student_outputs: Extracted features from student layer. + **kwargs: Additional arguments (ignored). + Returns: + Scalar distillation loss tensor. 
+ """ + + def apply_loss(loss_fn, teacher_features, student_features): + if loss_fn is None: + return 0.0 + + loss = keras.ops.mean(loss_fn(teacher_features, student_features)) + + if ( + hasattr(loss_fn, "__name__") + and "cosine" in loss_fn.__name__.lower() + ): + loss = keras.ops.subtract(1.0, loss) + + return loss + + loss_values = tree.map_structure( + apply_loss, self.loss, teacher_outputs, student_outputs + ) + + flat_losses = tree.flatten(loss_values) + return keras.ops.sum(keras.ops.stack(flat_losses)) + + def get_config(self): + """Get configuration for serialization.""" + return { + "loss": keras.losses.serialize(self.loss), + "teacher_layer_name": self.teacher_layer_name, + "student_layer_name": self.student_layer_name, + } + + @classmethod + def from_config(cls, config): + """Create instance from configuration.""" + config = config.copy() + config["loss"] = keras.losses.deserialize(config["loss"]) + return cls(**config) + + +@keras_export("keras.distillation.LogitsDistillation") +class LogitsDistillation(DistillationLoss): + """Distillation strategy that transfers knowledge from final model outputs. + + This strategy applies temperature scaling to the teacher's logits before + computing the loss between teacher and student predictions. It's the most + common approach for knowledge distillation. + + Args: + temperature: Temperature for softmax scaling. Higher values produce + softer probability distributions that are easier for the student to + learn. Typical values range from 3-5. Defaults to 3.0. + loss: Loss function to use for distillation. Can be: + - String identifier (e.g., 'kl_divergence', + 'categorical_crossentropy') + - Keras loss instance + - Nested structure of losses matching the model output structure + - None to skip distillation for that output (useful for multi-output + models where you only want to distill some outputs) + At least one loss must be non-None. Defaults to 'kl_divergence'. + + Examples: + + ```python + # Basic logits distillation with KL divergence + strategy = LogitsDistillation(temperature=3.0) + + # With categorical crossentropy loss + strategy = LogitsDistillation( + temperature=4.0, + loss="categorical_crossentropy" + ) + + # With custom loss instance + strategy = LogitsDistillation( + temperature=4.0, + loss=keras.losses.CategoricalCrossentropy(from_logits=True) + ) + + # For multi-output models + strategy = LogitsDistillation( + temperature=3.0, + loss=["kl_divergence", "categorical_crossentropy"] + ) + + # For multi-output models, only distill some outputs + strategy = LogitsDistillation( + temperature=3.0, + loss=["kl_divergence", None] # Skip second output + ) + ``` + """ + + def __init__( + self, + temperature=3.0, + loss="kl_divergence", + ): + self.temperature = temperature + self.loss = tree.map_structure(_convert_loss_to_function, loss) + + flat_losses = tree.flatten(self.loss) + if all(l is None for l in flat_losses): + raise ValueError("At least one loss must be non-None.") + + if not isinstance(self.temperature, (int, float)): + raise ValueError( + f"temperature must be a number, got {type(self.temperature)}" + ) + if self.temperature <= 0.0: + raise ValueError("temperature must be positive.") + + def compute_loss(self, teacher_outputs, student_outputs, **kwargs): + """Compute distillation loss using the configured loss function. + + Args: + teacher_outputs: Logits from teacher model. Can be a single tensor, + list/tuple of tensors, or dict of tensors. + student_outputs: Logits from student model. 
Can be a single tensor, + list/tuple of tensors, or dict of tensors. + **kwargs: Additional arguments (ignored). + Returns: + Distillation loss tensor. + """ + # Apply temperature scaling using tree.map_structure + teacher_scaled = tree.map_structure( + lambda x: keras.ops.divide(x, self.temperature), teacher_outputs + ) + student_scaled = tree.map_structure( + lambda x: keras.ops.divide(x, self.temperature), student_outputs + ) + + # Apply loss function(s) to corresponding outputs + def apply_loss(loss_fn, teacher_logits, student_logits): + if loss_fn is None: + return 0.0 + + # Special handling for KL divergence (needs probabilities) + if ( + hasattr(loss_fn, "__name__") + and "kl" in loss_fn.__name__.lower() + ): + teacher_probs = keras.ops.softmax(teacher_logits, axis=-1) + student_probs = keras.ops.softmax(student_logits, axis=-1) + loss = keras.ops.mean(loss_fn(teacher_probs, student_probs)) + # Scale by temperature^2 for KL (per literature) + return loss * (self.temperature**2) + else: + # For other losses, use logits directly + return keras.ops.mean(loss_fn(teacher_logits, student_logits)) + + # Apply losses using tree.map_structure + loss_values = tree.map_structure( + apply_loss, self.loss, teacher_scaled, student_scaled + ) + + # Sum all losses and return scalar + flat_losses = tree.flatten(loss_values) + return keras.ops.sum(keras.ops.stack(flat_losses)) + + def get_config(self): + """Get configuration for serialization.""" + return { + "temperature": self.temperature, + "loss": serialization_lib.serialize_keras_object(self.loss), + } + + @classmethod + def from_config(cls, config): + """Create instance from configuration.""" + config = config.copy() + config["loss"] = keras.losses.deserialize(config["loss"]) + return cls(**config) diff --git a/keras/src/distillation/distillation_loss_test.py b/keras/src/distillation/distillation_loss_test.py new file mode 100644 index 00000000000..c40399926e5 --- /dev/null +++ b/keras/src/distillation/distillation_loss_test.py @@ -0,0 +1,291 @@ +import numpy as np +import pytest + +import keras +from keras.src.distillation.distillation_loss import FeatureDistillation +from keras.src.distillation.distillation_loss import LogitsDistillation +from keras.src.distillation.distiller import Distiller +from keras.src.testing import TestCase + + +@pytest.mark.requires_trainable_backend +class TestLogitsDistillation(TestCase): + """Test cases for LogitsDistillation strategy.""" + + def test_logits_distillation_basic(self): + """Test basic logits distillation structure validation.""" + # Create dummy logits + teacher_logits = keras.ops.convert_to_tensor( + np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32" + ) + student_logits = keras.ops.convert_to_tensor( + np.array([[2.0, 1.0, 4.0], [3.0, 6.0, 2.0]]), dtype="float32" + ) + + # Verify that teacher and student outputs have the same structure + keras.tree.assert_same_structure(teacher_logits, student_logits) + + +@pytest.mark.requires_trainable_backend +class TestFeatureDistillation(TestCase): + """Test cases for FeatureDistillation strategy.""" + + def test_feature_distillation_basic(self): + """Test basic feature distillation structure validation.""" + # Create dummy features + teacher_features = keras.ops.convert_to_tensor( + np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), dtype="float32" + ) + student_features = keras.ops.convert_to_tensor( + np.array([[1.1, 2.1, 3.1], [4.1, 5.1, 6.1]]), dtype="float32" + ) + + # Verify that teacher and student outputs have the same structure + 
keras.tree.assert_same_structure(teacher_features, student_features) + + +@pytest.mark.requires_trainable_backend +class TestEndToEndDistillation(TestCase): + """End-to-end distillation tests with real models.""" + + def test_logits_distillation_end_to_end(self): + """Test end-to-end logits distillation with real models.""" + # Create teacher model (larger) + teacher = keras.Sequential( + [ + keras.layers.Dense( + 64, activation="relu", name="teacher_dense_1" + ), + keras.layers.Dense( + 32, activation="relu", name="teacher_dense_2" + ), + keras.layers.Dense( + 10, activation="softmax", name="teacher_output" + ), + ] + ) + + # Create student model (smaller) + student = keras.Sequential( + [ + keras.layers.Dense( + 32, activation="relu", name="student_dense_1" + ), + keras.layers.Dense( + 16, activation="relu", name="student_dense_2" + ), + keras.layers.Dense( + 10, activation="softmax", name="student_output" + ), + ] + ) + + # Create test data + x = np.random.random((32, 20)).astype(np.float32) + y = np.random.randint(0, 10, (32,)).astype(np.int32) + + # Build models to avoid JAX tracer issues + dummy_input = x[:2] + teacher(dummy_input) + student(dummy_input) + + # Create distiller + distiller = Distiller( + teacher=teacher, + student=student, + strategies=LogitsDistillation(temperature=3.0), + student_loss_weight=0.5, + ) + + # Compile distiller + distiller.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + # Test training + history = distiller.fit(x, y, epochs=2, verbose=0) + + # Verify training completed + self.assertIn("total_loss", history.history) + self.assertIn("student_loss", history.history) + self.assertIn("distillation_loss", history.history) + + # Verify loss values are reasonable + final_loss = history.history["total_loss"][-1] + self.assertTrue(np.isfinite(final_loss)) + self.assertGreater(final_loss, 0.0) + + # Test prediction + predictions = distiller.predict(x[:5], verbose=0) + self.assertEqual(predictions.shape, (5, 10)) + + # Test student model access + student_model = distiller.student + self.assertIsInstance(student_model, keras.Model) + + def test_feature_distillation_end_to_end(self): + """Test end-to-end feature distillation with real models.""" + # Create teacher model + teacher = keras.Sequential( + [ + keras.layers.Dense( + 32, activation="relu", name="teacher_dense_1" + ), + keras.layers.Dense( + 16, activation="relu", name="teacher_dense_2" + ), + keras.layers.Dense(10, name="teacher_output"), + ] + ) + + # Create student model with compatible intermediate layer sizes + student = keras.Sequential( + [ + keras.layers.Dense( + 32, activation="relu", name="student_dense_1" + ), + keras.layers.Dense( + 16, activation="relu", name="student_dense_2" + ), + keras.layers.Dense(10, name="student_output"), + ] + ) + + # Build models first + dummy_input = np.random.random((2, 20)).astype(np.float32) + teacher(dummy_input) + student(dummy_input) + + # Create distiller with feature distillation + distiller = Distiller( + teacher=teacher, + student=student, + strategies=FeatureDistillation( + loss="mse", + teacher_layer_name="teacher_dense_1", + student_layer_name="student_dense_1", + ), + student_loss_weight=0.5, + ) + + # Compile distiller + distiller.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + # Create test data + x = np.random.random((32, 20)).astype(np.float32) + y = np.random.randint(0, 10, (32,)).astype(np.int32) + + # Test training + history = 
distiller.fit(x, y, epochs=2, verbose=0) + + # Verify training completed + self.assertIn("total_loss", history.history) + self.assertIn("student_loss", history.history) + self.assertIn("distillation_loss", history.history) + + # Verify feature extraction worked + self.assertIsNotNone(distiller._teacher_feature_extractor) + self.assertIsNotNone(distiller._student_feature_extractor) + + # Test that feature extractors have correct outputs + self.assertEqual( + len(distiller._teacher_feature_extractor.outputs), 2 + ) # final + dense_1 + self.assertEqual( + len(distiller._student_feature_extractor.outputs), 2 + ) # final + dense_1 + + def test_multi_strategy_distillation_end_to_end(self): + """Test end-to-end distillation with multiple strategies.""" + # Create models + teacher = keras.Sequential( + [ + keras.layers.Dense( + 32, activation="relu", name="teacher_dense_1" + ), + keras.layers.Dense( + 16, activation="relu", name="teacher_dense_2" + ), + keras.layers.Dense(10, name="teacher_output"), + ] + ) + + student = keras.Sequential( + [ + keras.layers.Dense( + 32, activation="relu", name="student_dense_1" + ), + keras.layers.Dense( + 16, activation="relu", name="student_dense_2" + ), + keras.layers.Dense(10, name="student_output"), + ] + ) + + # Build models first + dummy_input = np.random.random((2, 20)).astype(np.float32) + teacher(dummy_input) + student(dummy_input) + + # Create multiple strategies + strategies = [ + LogitsDistillation(temperature=3.0), + FeatureDistillation( + loss="mse", + teacher_layer_name="teacher_dense_1", + student_layer_name="student_dense_1", + ), + FeatureDistillation( + loss="mse", + teacher_layer_name="teacher_dense_2", + student_layer_name="student_dense_2", + ), + ] + + # Create distiller + distiller = Distiller( + teacher=teacher, + student=student, + strategies=strategies, + strategy_weights=[1.0, 0.5, 0.3], + student_loss_weight=0.5, + ) + + # Compile distiller + distiller.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + # Create test data + x = np.random.random((32, 20)).astype(np.float32) + y = np.random.randint(0, 10, (32,)).astype(np.int32) + + # Test training + history = distiller.fit(x, y, epochs=2, verbose=0) + + # Verify training completed + self.assertIn("total_loss", history.history) + self.assertIn("student_loss", history.history) + self.assertIn("distillation_loss", history.history) + + # Verify efficient feature extraction + self.assertIsNotNone(distiller._teacher_feature_extractor) + self.assertIsNotNone(distiller._student_feature_extractor) + + # Should have 3 outputs: final + dense_1 + dense_2 + self.assertEqual(len(distiller._teacher_feature_extractor.outputs), 3) + self.assertEqual(len(distiller._student_feature_extractor.outputs), 3) + + # Test that loss decreases (learning is happening) + initial_loss = history.history["total_loss"][0] + final_loss = history.history["total_loss"][-1] + self.assertTrue(np.isfinite(initial_loss)) + self.assertTrue(np.isfinite(final_loss)) diff --git a/keras/src/distillation/distiller.py b/keras/src/distillation/distiller.py new file mode 100644 index 00000000000..ca802a775e1 --- /dev/null +++ b/keras/src/distillation/distiller.py @@ -0,0 +1,578 @@ +import keras +from keras.src import tree +from keras.src.api_export import keras_export +from keras.src.distillation.distillation_loss import _convert_loss_to_function +from keras.src.models.model import Model +from keras.src.saving import serialization_lib + + +@keras_export("keras.distillation.Distiller") 
+class Distiller(Model): + """Distillation model for transferring knowledge from teacher to student. + + Knowledge distillation transfers knowledge from a large, complex model + (teacher) to a smaller, simpler model (student). The student learns + from both ground truth labels and the teacher's predictions, often + achieving better performance than training on labels alone. + + Args: + teacher: A trained `keras.Model` that serves as the knowledge source. + The teacher model is frozen during distillation. + student: A `keras.Model` to be trained through distillation. + strategies: List of distillation strategies to apply. Can be a single + strategy or a list of strategies like `LogitsDistillation`, + `FeatureDistillation`, or custom distillation strategies. + strategy_weights: List of weights for each distillation strategy. Must + have the same length as `strategies`. If None, equal weights used. + student_loss_weight: Weight for the student's supervised loss component. + Must be between 0 and 1. Defaults to 0.5. + name: Name for the distiller model. Defaults to `"distiller"`. + **kwargs: Additional keyword arguments passed to the parent `Model` + class. + + Attributes: + student: The student model being trained. Access this to get the trained + student model for independent use after distillation training. + teacher: The teacher model providing knowledge. This model is frozen + during training. + + Examples: + + ```python + # Basic distillation with KerasHub models + import keras_hub as hub + + teacher = hub.models.CausalLM.from_preset("gemma_2b_en") + student = hub.models.CausalLM.from_preset( + "gemma_1.1_2b_en", load_weights=False + ) + + # Single distillation strategy + distiller = Distiller( + teacher=teacher, + student=student, + strategies=LogitsDistillation(temperature=3.0), + ) + + # Compile the distiller (like any Keras model) + distiller.compile( + optimizer='adam', + loss='sparse_categorical_crossentropy', + metrics=['accuracy'] + ) + + # Train the distiller + distiller.fit(x_train, y_train, epochs=10) + + # Access the trained student model + trained_student = distiller.student + + # Multiple distillation strategies + distiller = Distiller( + teacher=teacher, + student=student, + strategies=[ + LogitsDistillation(temperature=3.0), + FeatureDistillation( + teacher_layer_name="dense_1", + student_layer_name="dense_1" + ) + ], + strategy_weights=[1.0, 0.5], + ) + + # Compile with custom settings + distiller.compile( + optimizer='adam', + loss='sparse_categorical_crossentropy', + metrics=['accuracy'] + ) + ``` + """ + + def __init__( + self, + teacher, + student, + strategies, + strategy_weights=None, + student_loss_weight=0.5, + name="distiller", + **kwargs, + ): + super().__init__(name=name, **kwargs) + + # Validate inputs + self._validate_models(teacher, student) + + # Store configuration + self.teacher = teacher + self.student = student + + # Validate student_loss_weight + if not isinstance(student_loss_weight, (int, float)): + raise ValueError( + f"student_loss_weight must be a number, got " + f"{type(student_loss_weight)}" + ) + if student_loss_weight < 0.0 or student_loss_weight > 1.0: + raise ValueError( + f"student_loss_weight must be between 0.0 and 1.0, " + f"got {student_loss_weight}" + ) + self.student_loss_weight = student_loss_weight + + # Handle strategies configuration + if strategies is None: + raise ValueError( + "'strategies' cannot be None. Provide a distillation " + "strategy (e.g., LogitsDistillation or FeatureDistillation) " + "or a list of strategies." 
+ ) + + # Convert single strategy to list for uniform handling + if not isinstance(strategies, (list, tuple)): + self.strategies = [strategies] + self.strategy_weights = [1.0] + else: + self.strategies = strategies + # Set default weights if not provided + if strategy_weights is None: + self.strategy_weights = [1.0] * len(strategies) + else: + if len(strategy_weights) != len(strategies): + raise ValueError( + f"Number of strategy_weights ({len(strategy_weights)}) " + f"must match number of strategies ({len(strategies)})" + ) + self.strategy_weights = strategy_weights + + # Validate strategy-specific compatibility and create feature extractors + for strategy in self.strategies: + self._validate_strategy_compatibility(teacher, student, strategy) + + self._create_multi_feature_extractors() + + # Freeze teacher model + self.teacher.trainable = False + + # Initialize loss tracking metrics + self.student_loss_tracker = keras.metrics.Mean(name="student_loss") + self.distillation_loss_tracker = keras.metrics.Mean( + name="distillation_loss" + ) + self.total_loss_tracker = keras.metrics.Mean(name="total_loss") + + def _validate_models(self, teacher, student): + """Validate that teacher and student models are compatible.""" + if not isinstance(teacher, keras.Model): + raise ValueError( + f"Teacher must be a keras.Model, got {type(teacher)}" + ) + if not isinstance(student, keras.Model): + raise ValueError( + f"Student must be a keras.Model, got {type(student)}" + ) + + self._validate_input_compatibility(teacher, student) + self._validate_output_compatibility(teacher, student) + self._validate_dtype_compatibility(teacher, student) + + def _assert_shapes_are_compatible(self, shape1, shape2, context): + """Assert that two shapes are compatible.""" + if len(shape1) != len(shape2): + raise ValueError( + f"Teacher and student {context} shapes have different " + f"dimensions. Teacher: {shape1}, Student: {shape2}." + ) + + for dim1, dim2 in zip(shape1, shape2): + if dim1 is not None and dim2 is not None and dim1 != dim2: + raise ValueError( + f"Teacher and student {context} shapes are incompatible. " + f"Teacher: {shape1}, Student: {shape2}. " + f"All dimensions must match." + ) + + def _assert_same_dtype(self, teacher_dtype, student_dtype, context): + """Assert that teacher and student dtypes are the same.""" + if teacher_dtype != student_dtype: + raise ValueError( + f"Teacher and student {context} dtypes must match. " + f"Teacher: {teacher_dtype}, Student: {student_dtype}." 
+ ) + + def _validate_input_compatibility(self, teacher, student): + """Validate that teacher and student have compatible input shapes.""" + if not hasattr(teacher, "inputs") or not hasattr(student, "inputs"): + return + teacher_inputs = getattr(teacher, "inputs") + student_inputs = getattr(student, "inputs") + if teacher_inputs is None or student_inputs is None: + return + + tree.map_structure( + lambda ti, si: self._assert_shapes_are_compatible( + ti.shape, si.shape, "input" + ), + teacher_inputs, + student_inputs, + ) + + def _validate_output_compatibility(self, teacher, student): + """Validate that teacher and student have compatible output shapes.""" + if not hasattr(teacher, "outputs") or not hasattr(student, "outputs"): + return + teacher_outputs = getattr(teacher, "outputs") + student_outputs = getattr(student, "outputs") + if teacher_outputs is None or student_outputs is None: + return + + tree.map_structure( + lambda to, so: self._assert_shapes_are_compatible( + to.shape, so.shape, "output" + ), + teacher_outputs, + student_outputs, + ) + + def _validate_dtype_compatibility(self, teacher, student): + """Validate that teacher and student have compatible data types.""" + if not hasattr(teacher, "inputs") or not hasattr(student, "inputs"): + return + if teacher.inputs is None or student.inputs is None: + return + + tree.map_structure( + lambda ti, si: self._assert_same_dtype(ti.dtype, si.dtype, "input"), + teacher.inputs, + student.inputs, + ) + + if not hasattr(teacher, "outputs") or not hasattr(student, "outputs"): + return + if teacher.outputs is None or student.outputs is None: + return + + tree.map_structure( + lambda to, so: self._assert_same_dtype( + to.dtype, so.dtype, "output" + ), + teacher.outputs, + student.outputs, + ) + + def _validate_strategy_compatibility(self, teacher, student, strategy): + """Validate that the strategy is compatible with the teacher and student + models.""" + strategy.validate_model_compatibility(teacher, student) + + def _create_multi_feature_extractors(self): + """Create feature extractors for efficient multi-layer extraction.""" + teacher_layer_names = [] + student_layer_names = [] + + for strategy in self.strategies: + if ( + hasattr(strategy, "teacher_layer_name") + and strategy.teacher_layer_name + ): + if strategy.teacher_layer_name not in teacher_layer_names: + teacher_layer_names.append(strategy.teacher_layer_name) + if ( + hasattr(strategy, "student_layer_name") + and strategy.student_layer_name + ): + if strategy.student_layer_name not in student_layer_names: + student_layer_names.append(strategy.student_layer_name) + + self._teacher_feature_extractor = self._create_feature_extractor( + self.teacher, teacher_layer_names + ) + self._student_feature_extractor = self._create_feature_extractor( + self.student, student_layer_names + ) + + def _create_feature_extractor(self, model, layer_names): + """Create a feature extractor for a model. + + Args: + model: The model to create an extractor for. + layer_names: List of layer names to extract features from. + + Returns: + Feature extractor model or None if no layer names provided. + + Raises: + ValueError: If model has no symbolic inputs/outputs. + """ + if not layer_names: + return None + + if not hasattr(model, "inputs") or model.inputs is None: + raise ValueError( + f"Cannot create feature extractor for {model.name}. " + f"The model has no symbolic inputs attribute." 
+ ) + + if isinstance(model, keras.Sequential): + final_output = model.layers[-1].output + else: + final_output = model.output + + outputs = {"final_output": final_output} + for layer_name in layer_names: + layer = model.get_layer(name=layer_name) + outputs[layer_name] = layer.output + + return keras.Model( + inputs=model.inputs, + outputs=outputs, + name=f"{model.name}_multi_feature_extractor", + ) + + def _extract_all_teacher_features(self, x): + """Extract all teacher features in a single forward pass.""" + if self._teacher_feature_extractor is not None: + return self._teacher_feature_extractor(x, training=False) + else: + return {"final_output": self.teacher(x, training=False)} + + def _extract_all_student_features(self, x, y_pred): + """Extract all student features in a single forward pass.""" + if self._student_feature_extractor is not None: + return self._student_feature_extractor(x, training=True) + else: + return {"final_output": y_pred} + + def _get_strategy_features(self, strategy, all_features, is_teacher): + """Get the specific features needed by a strategy.""" + if is_teacher: + layer_name = strategy.teacher_layer_name or "final_output" + else: + layer_name = strategy.student_layer_name or "final_output" + + if layer_name not in all_features: + raise ValueError( + f"Layer '{layer_name}' not found in extracted features. " + f"Available: {list(all_features.keys())}" + ) + + return all_features[layer_name] + + def compile(self, optimizer="adam", loss=None, metrics=None, **kwargs): + """Compile the distiller with proper integration. + + Args: + optimizer: Optimizer for training the student model. + loss: Student loss function for the student's supervised learning. + Can be a string identifier or a loss function instance. + metrics: Additional metrics to track during training. + **kwargs: Additional arguments passed to parent compile. + """ + if loss is None: + raise ValueError("'loss' cannot be None.") + + self._student_loss = tree.map_structure(_convert_loss_to_function, loss) + self._student_loss_for_serialization = loss + + if metrics is not None and not isinstance(metrics, (list, tuple)): + raise ValueError( + f"metrics must be a list or tuple, got {type(metrics)}" + ) + + super().compile( + optimizer=optimizer, + loss=None, + metrics=metrics, + **kwargs, + ) + + def call(self, inputs, training=None, **kwargs): + """Forward pass returns student predictions.""" + return self.student(inputs, training=training, **kwargs) + + def compute_loss( + self, x=None, y=None, y_pred=None, sample_weight=None, training=True + ): + """Compute combined distillation loss. + + Args: + x: Input data. + y: Target data. + y_pred: Model predictions. + sample_weight: Sample weights (currently unused). + training: Whether the model is in training mode. + + Returns: + Combined loss tensor. 
+ """ + # Handle case where y_pred is not provided + if y_pred is None: + y_pred = self(x, training=training) + # Compute student loss using tree operations for dicts, manual for lists + student_loss = 0.0 + if self.student_loss_weight > 0.0 and y is not None: + # Use tree.map_structure for cleaner loss computation + try: + loss_values = tree.map_structure( + lambda l, o, o_pred: l(o, o_pred), + self._student_loss, + y, + y_pred, + ) + flat_losses = tree.flatten(loss_values) + student_loss = ( + keras.ops.sum(keras.ops.stack(flat_losses)) + if len(flat_losses) > 1 + else flat_losses[0] + ) + except (ValueError, TypeError): + # Fallback for TrackedDict compatibility issues + if isinstance(self._student_loss, dict): + loss_values = { + key: self._student_loss[key](y[key], y_pred[key]) + for key in self._student_loss.keys() + } + flat_losses = tree.flatten(loss_values) + student_loss = keras.ops.sum(keras.ops.stack(flat_losses)) + elif isinstance(self._student_loss, (list, tuple)): + loss_values = [ + loss_fn(y_true, y_pred_i) + for loss_fn, y_true, y_pred_i in zip( + self._student_loss, y, y_pred + ) + ] + flat_losses = tree.flatten(loss_values) + student_loss = keras.ops.sum(keras.ops.stack(flat_losses)) + else: + # Single output case + student_loss = self._student_loss(y, y_pred) + + # Ensure student_loss is a scalar + if hasattr(student_loss, "shape") and len(student_loss.shape) > 0: + student_loss = keras.ops.mean(student_loss) + + # Compute distillation loss + distillation_loss = 0.0 + if self.student_loss_weight < 1.0: + teacher_features = self._extract_all_teacher_features(x) + student_features = self._extract_all_student_features(x, y_pred) + + # Apply strategies using pre-extracted features + for strategy, weight in zip(self.strategies, self.strategy_weights): + # Get appropriate outputs/features for this strategy + if ( + hasattr(strategy, "teacher_layer_name") + and strategy.teacher_layer_name is not None + ): + # FeatureDistillation with specific layers + try: + strategy_teacher_output = self._get_strategy_features( + strategy, teacher_features, is_teacher=True + ) + strategy_student_output = self._get_strategy_features( + strategy, student_features, is_teacher=False + ) + except ValueError as e: + # Re-raise with context about which strategy failed + raise RuntimeError( + f"Failed to extract features for " + f"FeatureDistillation targeting teacher layer " + f"'{strategy.teacher_layer_name}' and student " + f"layer '{strategy.student_layer_name}'. " + f"Original error: {e}" + ) from e + else: + # LogitsDistillation or FeatureDistillation (final outputs) + strategy_teacher_output = teacher_features["final_output"] + strategy_student_output = y_pred + + # Validate outputs are compatible for this strategy + strategy.validate_outputs( + strategy_teacher_output, strategy_student_output + ) + + # Compute loss for this strategy + strategy_loss = strategy.compute_loss( + strategy_teacher_output, strategy_student_output + ) + + # Validate that strategy returns a scalar + if ( + hasattr(strategy_loss, "shape") + and len(strategy_loss.shape) > 0 + ): + raise ValueError( + f"Strategy {strategy.__class__.__name__} returned a " + f"non-scalar loss with shape {strategy_loss.shape}. " + f"The compute_loss method must return a scalar tensor." 
+ ) + + # Apply weight and add to total + distillation_loss = keras.ops.add( + distillation_loss, keras.ops.multiply(weight, strategy_loss) + ) + + # Combine losses + total_loss = keras.ops.add( + keras.ops.multiply(self.student_loss_weight, student_loss), + keras.ops.multiply( + keras.ops.subtract(1.0, self.student_loss_weight), + distillation_loss, + ), + ) + + # Update metrics + self.student_loss_tracker.update_state(student_loss) + self.distillation_loss_tracker.update_state(distillation_loss) + self.total_loss_tracker.update_state(total_loss) + + return total_loss + + def reset_metrics(self): + """Reset all metrics.""" + super().reset_metrics() + self.student_loss_tracker.reset_state() + self.distillation_loss_tracker.reset_state() + self.total_loss_tracker.reset_state() + + def get_config(self): + """Get configuration for serialization.""" + config = super().get_config() + config.update( + { + "teacher": serialization_lib.serialize_keras_object( + self.teacher + ), + "student": serialization_lib.serialize_keras_object( + self.student + ), + "strategies": [ + serialization_lib.serialize_keras_object(strategy) + for strategy in self.strategies + ], + "strategy_weights": self.strategy_weights, + "student_loss_weight": self.student_loss_weight, + } + ) + return config + + @classmethod + def from_config(cls, config): + """Create instance from configuration.""" + config = config.copy() + + # Deserialize objects + config["teacher"] = serialization_lib.deserialize_keras_object( + config["teacher"] + ) + config["student"] = serialization_lib.deserialize_keras_object( + config["student"] + ) + config["strategies"] = [ + serialization_lib.deserialize_keras_object(strategy) + for strategy in config["strategies"] + ] + + return cls(**config) diff --git a/keras/src/distillation/distiller_test.py b/keras/src/distillation/distiller_test.py new file mode 100644 index 00000000000..67025c043aa --- /dev/null +++ b/keras/src/distillation/distiller_test.py @@ -0,0 +1,526 @@ +import json +import os +import tempfile + +import numpy as np +import pytest + +import keras +from keras.src.distillation.distillation_loss import LogitsDistillation +from keras.src.distillation.distiller import Distiller +from keras.src.testing import TestCase + + +class SimpleTeacher(keras.Model): + """Simple teacher model for testing.""" + + def __init__(self, vocab_size=10, hidden_dim=32): + super().__init__() + self.dense1 = keras.layers.Dense(hidden_dim, activation="relu") + self.dense2 = keras.layers.Dense(vocab_size) + + def call(self, inputs, training=None): + x = self.dense1(inputs) + return self.dense2(x) + + +class SimpleStudent(keras.Model): + """Simple student model for testing.""" + + def __init__(self, vocab_size=10, hidden_dim=16): + super().__init__() + self.dense1 = keras.layers.Dense(hidden_dim, activation="relu") + self.dense2 = keras.layers.Dense(vocab_size) + + def call(self, inputs, training=None): + x = self.dense1(inputs) + return self.dense2(x) + + +@pytest.mark.requires_trainable_backend +class TestDistiller(TestCase): + """Essential test cases for the Distiller class.""" + + def setUp(self): + """Set up test fixtures.""" + super().setUp() + + # Create test data + self.x = np.random.random((20, 5)).astype(np.float32) + self.y = np.random.randint(0, 10, (20,)).astype(np.int32) + + # Create teacher and student models + self.teacher = SimpleTeacher(vocab_size=10, hidden_dim=32) + self.student = SimpleStudent(vocab_size=10, hidden_dim=16) + + # Build models + dummy_input = self.x[:2] + self.teacher(dummy_input) 
+ self.student(dummy_input) + + # Create distillation strategy + self.strategy = LogitsDistillation(temperature=2.0) + + # Create distiller + self.distiller = Distiller( + teacher=self.teacher, + student=self.student, + strategies=self.strategy, + student_loss_weight=0.5, + ) + + # Compile distiller + self.distiller.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + def test_distiller_initialization(self): + """Test Distiller initialization.""" + # Check that teacher is frozen + self.assertFalse(self.teacher.trainable) + + # Check that student is trainable + self.assertTrue(self.student.trainable) + + # Check student_loss_weight + self.assertEqual(self.distiller.student_loss_weight, 0.5) + + # Check strategies (should be a list with one strategy) + self.assertIsInstance(self.distiller.strategies, list) + self.assertEqual(len(self.distiller.strategies), 1) + self.assertIsInstance(self.distiller.strategies[0], LogitsDistillation) + + # Check that strategy has the correct temperature + self.assertEqual(self.distiller.strategies[0].temperature, 2.0) + + # Check that model is compiled + self.assertIsNotNone(self.distiller.optimizer) + # Check if the model has been compiled (different backends may handle + # this differently) + self.assertTrue( + hasattr(self.distiller, "_compile_config") + or hasattr(self.distiller, "compiled_loss"), + "Model should be compiled", + ) + + def test_distiller_call(self): + """Test Distiller call method (inference).""" + # Call should return student outputs + outputs = self.distiller(self.x) + + # Check output shape + expected_shape = (20, 10) # batch_size, vocab_size + self.assertEqual(outputs.shape, expected_shape) + + # Check that outputs are from student, not teacher + student_outputs = self.student(self.x) + self.assertAllClose(outputs, student_outputs) + + def test_teacher_freezing(self): + """Test that teacher is properly frozen.""" + # Teacher should be frozen + self.assertFalse(self.teacher.trainable) + + # Student should be trainable + self.assertTrue(self.student.trainable) + + # Create a new teacher that is trainable and verify it gets frozen + new_teacher = SimpleTeacher(vocab_size=10, hidden_dim=32) + self.assertTrue(new_teacher.trainable) # Should be trainable initially + + # Create distiller - should freeze the teacher + Distiller( + teacher=new_teacher, + student=self.student, + strategies=self.strategy, + student_loss_weight=0.5, + ) + + # Teacher should now be frozen + self.assertFalse(new_teacher.trainable) + + def test_model_compatibility_validation(self): + """Test model compatibility validation.""" + # Test with non-Keras objects + with self.assertRaises(ValueError): + Distiller( + teacher="not_a_model", + student=self.student, + strategies=self.strategy, + ) + + with self.assertRaises(ValueError): + Distiller( + teacher=self.teacher, + student="not_a_model", + strategies=self.strategy, + ) + + def test_multi_strategy_functionality(self): + """Test multi-strategy functionality.""" + # Create multiple strategies + strategies = [ + LogitsDistillation(temperature=3.0), + LogitsDistillation(temperature=2.0), + ] + strategy_weights = [0.7, 0.3] + + # Create distiller with multiple strategies + distiller = Distiller( + teacher=self.teacher, + student=self.student, + strategies=strategies, + strategy_weights=strategy_weights, + student_loss_weight=0.5, + ) + + # Compile distiller + distiller.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + # 
Test that strategies are stored correctly + self.assertEqual(len(distiller.strategies), 2) + self.assertEqual(distiller.strategy_weights, [0.7, 0.3]) + + # Test training + x = np.random.random((10, 5)).astype(np.float32) + y = np.random.randint(0, 10, (10,)) + history = distiller.fit(x, y, epochs=1, verbose=0) + + # Check metrics + self.assertIn("total_loss", history.history) + self.assertIn("student_loss", history.history) + self.assertIn("distillation_loss", history.history) + + def test_multi_strategy_validation(self): + """Test multi-strategy validation.""" + strategies = [ + LogitsDistillation(temperature=3.0), + LogitsDistillation(temperature=2.0), + ] + + # Test that validation passes for valid configurations + distiller = Distiller( + teacher=self.teacher, + student=self.student, + strategies=strategies, + student_loss_weight=0.5, + ) + + self.assertEqual(len(distiller.strategies), 2) + + # Test invalid strategy weights length + with self.assertRaises(ValueError): + Distiller( + teacher=self.teacher, + student=self.student, + strategies=strategies, + strategy_weights=[1.0], # Wrong length + student_loss_weight=0.5, + ) + + def test_student_loss_weighting(self): + """Test student loss weighting functionality.""" + # Test with student_loss_weight = 0.0 (only distillation loss) + distiller_0 = Distiller( + teacher=self.teacher, + student=self.student, + strategies=self.strategy, + student_loss_weight=0.0, + ) + + # Test with student_loss_weight = 1.0 (only student loss) + distiller_1 = Distiller( + teacher=self.teacher, + student=self.student, + strategies=self.strategy, + student_loss_weight=1.0, + ) + + # Compile both distillers + distiller_0.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + distiller_1.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + # Test that they can be used for training without errors + small_x = self.x[:5] + small_y = self.y[:5] + + # Both should train without errors + history_0 = distiller_0.fit(small_x, small_y, epochs=1, verbose=0) + history_1 = distiller_1.fit(small_x, small_y, epochs=1, verbose=0) + + # Check that training completed + self.assertIn("total_loss", history_0.history) + self.assertIn("total_loss", history_1.history) + + def test_full_training_workflow(self): + """Test complete training workflow with model.fit() - MOST IMPORTANT.""" + # Create larger dataset for training + np.random.seed(42) + x_train = np.random.random((100, 5)).astype(np.float32) + y_train = np.random.randint(0, 10, (100,)).astype(np.int32) + x_val = np.random.random((20, 5)).astype(np.float32) + y_val = np.random.randint(0, 10, (20,)).astype(np.int32) + + # Create fresh models for training + teacher = SimpleTeacher(vocab_size=10, hidden_dim=32) + student = SimpleStudent(vocab_size=10, hidden_dim=16) + + # Build models to avoid JAX tracer issues + dummy_input = x_train[:2] + teacher(dummy_input) + student(dummy_input) + + # Create distiller + distiller = Distiller( + teacher=teacher, + student=student, + strategies=self.strategy, + student_loss_weight=0.5, + ) + + # Compile distiller + distiller.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + # Train the model + history = distiller.fit( + x_train, + y_train, + validation_data=(x_val, y_val), + epochs=3, + batch_size=16, + verbose=0, + ) + + # Check that training completed + self.assertIn("total_loss", history.history) + self.assertIn("val_total_loss", 
history.history) + self.assertIn("student_loss", history.history) + self.assertIn("distillation_loss", history.history) + + # Check that losses are finite + for loss_name in ["total_loss", "student_loss", "distillation_loss"]: + losses = history.history[loss_name] + self.assertGreater(len(losses), 0) + for loss in losses: + self.assertTrue(np.isfinite(loss)) + + # Check that the model can make predictions + predictions = distiller.predict(x_val[:5], verbose=0) + self.assertEqual(predictions.shape, (5, 10)) # batch_size, vocab_size + + # Check that student weights have changed (indicating learning) + initial_weights = [w.numpy().copy() for w in student.trainable_weights] + + # Train a bit more + distiller.fit(x_train[:10], y_train[:10], epochs=1, verbose=0) + + final_weights = [w.numpy() for w in student.trainable_weights] + + # At least some weights should have changed + weights_changed = any( + not np.allclose(initial, final, atol=1e-6) + for initial, final in zip(initial_weights, final_weights) + ) + self.assertTrue( + weights_changed, "Student weights should change during training" + ) + + def test_evaluation_workflow(self): + """Test evaluation workflow with model.evaluate().""" + # Create dataset + np.random.seed(42) + x_test = np.random.random((30, 5)).astype(np.float32) + y_test = np.random.randint(0, 10, (30,)).astype(np.int32) + + # Create fresh models + teacher = SimpleTeacher(vocab_size=10, hidden_dim=32) + student = SimpleStudent(vocab_size=10, hidden_dim=16) + + # Build models to avoid JAX tracer issues + dummy_input = x_test[:2] + teacher(dummy_input) + student(dummy_input) + + # Create distiller + distiller = Distiller( + teacher=teacher, + student=student, + strategies=self.strategy, + student_loss_weight=0.5, + ) + + # Compile distiller + distiller.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + # Train briefly + distiller.fit(x_test[:10], y_test[:10], epochs=1, verbose=0) + + # Evaluate the model + results = distiller.evaluate(x_test, y_test, verbose=0) + + # Check that evaluation returns expected metrics + self.assertIsInstance(results, list) + self.assertGreater(len(results), 0) + + # All results should be finite + for result in results: + self.assertTrue(np.isfinite(result)) + + def test_prediction_workflow(self): + """Test prediction workflow with model.predict().""" + # Create dataset + np.random.seed(42) + x_test = np.random.random((20, 5)).astype(np.float32) + + # Create fresh models + teacher = SimpleTeacher(vocab_size=10, hidden_dim=32) + student = SimpleStudent(vocab_size=10, hidden_dim=16) + + # Build models to avoid JAX tracer issues + dummy_input = x_test[:2] + teacher(dummy_input) + student(dummy_input) + + # Create distiller + distiller = Distiller( + teacher=teacher, + student=student, + strategies=self.strategy, + student_loss_weight=0.5, + ) + + # Make predictions + predictions = distiller.predict(x_test, verbose=0) + + # Check prediction shape + self.assertEqual(predictions.shape, (20, 10)) # batch_size, vocab_size + + # Check that predictions are finite + self.assertTrue(np.all(np.isfinite(predictions))) + + # Check predictions sum to reasonable values (not zeros/infinities) + prediction_sums = np.sum(predictions, axis=1) + self.assertTrue(np.all(np.isfinite(prediction_sums))) + + def test_distiller_serialization_and_saving(self): + """Test Distiller serialization, saving, and loading.""" + + # Use standard Sequential models for serialization testing + teacher = keras.Sequential( + [ + 
keras.layers.Dense( + 32, activation="relu", name="teacher_dense_1" + ), + keras.layers.Dense( + 16, activation="relu", name="teacher_dense_2" + ), + keras.layers.Dense(10, name="teacher_output"), + ] + ) + + student = keras.Sequential( + [ + keras.layers.Dense( + 16, activation="relu", name="student_dense_1" + ), + keras.layers.Dense( + 8, activation="relu", name="student_dense_2" + ), + keras.layers.Dense(10, name="student_output"), + ] + ) + + # Create distiller with single strategy + strategy = LogitsDistillation(temperature=3.0, loss="kl_divergence") + + original_distiller = Distiller( + teacher=teacher, + student=student, + strategies=strategy, + student_loss_weight=0.7, + ) + + # Build the models by calling them + x_test = np.random.random((2, 20)).astype(np.float32) + _ = original_distiller(x_test) + + # Test get_config + config = original_distiller.get_config() + + # Verify all components are in config + required_keys = [ + "teacher", + "student", + "strategies", + "strategy_weights", + "student_loss_weight", + ] + for key in required_keys: + self.assertIn(key, config, f"Missing key: {key}") + + # Test JSON serialization + json_str = json.dumps(config) + self.assertIsInstance(json_str, str) + + # Test from_config reconstruction + reconstructed_distiller = Distiller.from_config(config) + + # Verify reconstruction + self.assertEqual(reconstructed_distiller.student_loss_weight, 0.7) + self.assertIsInstance( + reconstructed_distiller.strategies[0], LogitsDistillation + ) + + # Verify strategy parameters + self.assertEqual(reconstructed_distiller.strategies[0].temperature, 3.0) + + # Test that reconstructed distiller can be used for inference + reconstructed_output = reconstructed_distiller(x_test) + self.assertEqual(reconstructed_output.shape, (2, 10)) + + # Test model saving and loading (full integration test) + with tempfile.TemporaryDirectory() as temp_dir: + model_path = os.path.join(temp_dir, "distiller_model.keras") + + # Compile original distiller + original_distiller.compile( + loss="sparse_categorical_crossentropy", + ) + + # Save the model + original_distiller.save(model_path) + + # Load the model + loaded_distiller = keras.models.load_model(model_path) + + # Verify loaded model works + loaded_output = loaded_distiller(x_test) + self.assertEqual(loaded_output.shape, (2, 10)) + + # Verify parameters are preserved + self.assertEqual(loaded_distiller.student_loss_weight, 0.7) + + # The core serialization functionality is working + self.assertTrue(True, "Distiller serialization test passed")
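
Note: the `DistillationLoss` docstring above states that custom distillation losses are created by subclassing it and overriding `compute_loss`, but the diff ships no standalone example of that pattern. Below is a minimal sketch of what such a subclass and its use with `Distiller` could look like, assuming the API exactly as introduced in this diff; the `SoftTargetMSE` class, the toy models, and the data shapes are hypothetical.

```python
import numpy as np

import keras
from keras import ops
from keras.distillation import DistillationLoss, Distiller


class SoftTargetMSE(DistillationLoss):
    """Hypothetical custom loss: MSE between temperature-softened
    teacher and student probability distributions."""

    def __init__(self, temperature=2.0):
        self.temperature = temperature

    def compute_loss(self, teacher_outputs, student_outputs, **kwargs):
        # Soften both distributions, then return a scalar loss,
        # as required by the DistillationLoss contract above.
        t = ops.softmax(teacher_outputs / self.temperature, axis=-1)
        s = ops.softmax(student_outputs / self.temperature, axis=-1)
        return ops.mean(ops.square(ops.subtract(t, s)))

    def get_config(self):
        return {"temperature": self.temperature}


# Toy teacher/student models; layer sizes are illustrative only.
teacher = keras.Sequential(
    [keras.layers.Dense(32, activation="relu"), keras.layers.Dense(10)]
)
student = keras.Sequential(
    [keras.layers.Dense(16, activation="relu"), keras.layers.Dense(10)]
)

x = np.random.random((8, 5)).astype("float32")
y = np.random.randint(0, 10, (8,)).astype("int32")

# Build the models before wrapping them, mirroring the tests above.
teacher(x)
student(x)

distiller = Distiller(
    teacher=teacher,
    student=student,
    strategies=SoftTargetMSE(temperature=2.0),
    student_loss_weight=0.5,
)
distiller.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
distiller.fit(x, y, epochs=1, verbose=0)
```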