feat: implement automatic checkpointing for knowledge distillation

claude · claude · commit 7d2dfa8fc6fd · 2025-11-12T02:20:13.000Z
Integrate automatic checkpointing directly into KnowledgeDistillationTrainerBase,
eliminating the need for manual checkpoint management code.

Changes:
- Add CheckpointConfig property (null = disabled, set = enabled)
- Add Student property for model to checkpoint
- Add CheckpointManager (auto-created when CheckpointConfig is set)
- Add tracking fields for validation metrics and training loss
- Update OnTrainingStart() to initialize CheckpointManager
- Update OnEpochEnd() to auto-save checkpoints with metrics
- Update OnValidationComplete() to track validation metrics
- Update OnTrainingEnd() to auto-load best checkpoint
- Update CHECKPOINTING_GUIDE.md with automatic usage example

Benefits:
- Zero manual checkpoint code required from users
- Configuration-driven (just set CheckpointConfig property)
- Automatic best model selection via validation metrics
- Automatic checkpoint pruning (keeps only best N)
- Curriculum state preservation (if using curriculum strategies)
- Clean, simple API for 99% of use cases
- Manual control still available for advanced scenarios

Usage:
trainer.CheckpointConfig = new DistillationCheckpointConfig {
    SaveEveryEpochs = 5,
    KeepBestN = 3
};
trainer.Student = student as ICheckpointableModel;
trainer.Train(...); // Checkpointing happens automatically!
diff --git a/src/KnowledgeDistillation/CHECKPOINTING_GUIDE.md b/src/KnowledgeDistillation/CHECKPOINTING_GUIDE.md
@@ -60,9 +60,83 @@ DistillationCheckpointManager<T>
 └─ GetBestCheckpoint() (query metadata)
 ```
 
-## Basic Usage
+## Quick Start: Automatic Checkpointing (Recommended)
 
-### Example 1: Simple Student Checkpointing
+The easiest way to enable checkpointing is through automatic checkpointing built into the trainer. Simply configure the checkpoint settings and the trainer handles everything automatically.
+
+### Automatic Checkpointing Example
+
+```csharp
+using AiDotNet.KnowledgeDistillation;
+
+// Create trainer
+var teacher = LoadPretrainedTeacher();
+var student = CreateStudentModel();  // Must implement ICheckpointableModel
+var strategy = new ConfidenceBasedAdaptiveStrategy<double>();
+
+var trainer = new KnowledgeDistillationTrainer<double, Vector<double>, Vector<double>>(
+    teacher,
+    strategy
+);
+
+// Enable automatic checkpointing by setting CheckpointConfig
+trainer.CheckpointConfig = new DistillationCheckpointConfig
+{
+    CheckpointDirectory = "./checkpoints",
+    SaveEveryEpochs = 5,          // Auto-save every 5 epochs
+    KeepBestN = 3,                // Keep only 3 best checkpoints
+    SaveStudent = true,
+    BestMetric = "validation_loss",
+    LowerIsBetter = true
+};
+
+// Set the student model (required for checkpointing)
+trainer.Student = student as ICheckpointableModel;
+
+// Train - checkpointing happens automatically!
+trainer.Train(
+    studentForward: student.Predict,
+    studentBackward: student.ApplyGradient,
+    trainInputs: trainingData,
+    trainLabels: trainingLabels,
+    epochs: 100,
+    batchSize: 32,
+    validationInputs: validationData,
+    validationLabels: validationLabels
+);
+
+// After training completes, the best checkpoint is automatically loaded!
+Console.WriteLine("Training complete. Best checkpoint automatically restored.");
+```
+
+**What happens automatically:**
+1. **OnTrainingStart**: Checkpoint manager is initialized
+2. **OnEpochEnd**: Checkpoints are saved based on your configuration
+3. **OnValidationComplete**: Validation metrics are tracked for best checkpoint selection
+4. **OnTrainingEnd**: Best checkpoint is automatically loaded
+
+**Benefits:**
+- ✅ Zero manual checkpoint management code
+- ✅ Automatic best model selection
+- ✅ Automatic checkpoint pruning (keeps only best N)
+- ✅ Curriculum state preservation (if using curriculum strategies)
+- ✅ Clean, simple API
+
+### Disabling Automatic Checkpointing
+
+```csharp
+// Default: no checkpointing
+trainer.CheckpointConfig = null;  // or simply don't set it
+
+// Training proceeds without checkpointing
+trainer.Train(...);
+```
+
+## Manual Checkpointing (Advanced)
+
+For advanced use cases where you need fine-grained control over checkpoint timing and logic, you can use the `DistillationCheckpointManager` directly.
+
+### Example 1: Simple Student Checkpointing (Manual)
 
 ```csharp
 using AiDotNet.KnowledgeDistillation;
diff --git a/src/KnowledgeDistillation/KnowledgeDistillationTrainerBase.cs b/src/KnowledgeDistillation/KnowledgeDistillationTrainerBase.cs
@@ -56,6 +56,47 @@ public abstract class KnowledgeDistillationTrainerBase<T, TInput, TOutput> : IKn
     /// </summary>
     public IDistillationStrategy<T, TOutput> DistillationStrategy { get; protected set; }
 
+    /// <summary>
+    /// Gets or sets the checkpoint configuration for automatic model saving during training.
+    /// </summary>
+    /// <remarks>
+    /// <para><b>For Beginners:</b> Set this property to enable automatic checkpointing.
+    /// If null (default), no automatic checkpointing occurs. If set, the trainer will automatically:
+    /// - Save checkpoints based on your configuration (e.g., every 5 epochs)
+    /// - Keep only the best N checkpoints to save disk space
+    /// - Load the best checkpoint after training completes</para>
+    ///
+    /// <para><b>Example:</b>
+    /// <code>
+    /// trainer.CheckpointConfig = new DistillationCheckpointConfig
+    /// {
+    ///     SaveEveryEpochs = 5,
+    ///     KeepBestN = 3,
+    ///     BestMetric = "validation_loss"
+    /// };
+    /// </code>
+    /// </para>
+    /// </remarks>
+    public DistillationCheckpointConfig? CheckpointConfig { get; set; }
+
+    /// <summary>
+    /// Gets or sets the student model for checkpointing.
+    /// </summary>
+    /// <remarks>
+    /// <para>Set this if you want automatic checkpointing. The student must implement
+    /// <see cref="ICheckpointableModel"/> for checkpoint saving/loading to work.</para>
+    /// </remarks>
+    public ICheckpointableModel? Student { get; set; }
+
+    /// <summary>
+    /// Gets the checkpoint manager (created automatically when CheckpointConfig is set).
+    /// </summary>
+    protected DistillationCheckpointManager<T>? CheckpointManager { get; private set; }
+
+    private double _lastValidationMetric;
+    private T _lastTrainingLoss;
+    private int _currentEpoch;
+
     /// <summary>
     /// Initializes a new instance of the KnowledgeDistillationTrainerBase class.
     /// </summary>
@@ -66,6 +107,9 @@ public abstract class KnowledgeDistillationTrainerBase<T, TInput, TOutput> : IKn
     /// <para><b>For Beginners:</b> The teacher and strategy are the core components:
     /// - Teacher: Provides the "expert" knowledge to transfer
     /// - Strategy: Defines how to measure and optimize the knowledge transfer</para>
+    ///
+    /// <para><b>Automatic Checkpointing:</b> To enable automatic checkpointing, set the
+    /// <see cref="CheckpointConfig"/> and <see cref="Student"/> properties after construction.</para>
     /// </remarks>
     protected KnowledgeDistillationTrainerBase(
         ITeacherModel<TInput, TOutput> teacher,
@@ -76,6 +120,7 @@ protected KnowledgeDistillationTrainerBase(
         DistillationStrategy = distillationStrategy ?? throw new ArgumentNullException(nameof(distillationStrategy));
         NumOps = MathHelper.GetNumericOperations<T>();
         Random = seed.HasValue ? new Random(seed.Value) : new Random();
+        _lastTrainingLoss = NumOps.Zero;
     }
 
     /// <summary>
@@ -422,10 +467,17 @@ protected int ArgMax(Vector<T> vector)
     /// - Initialize EMA buffers (for self-distillation)
     /// - Setup curriculum schedules
     /// - Allocate temporary buffers</para>
+    ///
+    /// <para><b>Automatic Checkpointing:</b> If <see cref="CheckpointConfig"/> is set, this method
+    /// automatically initializes the checkpoint manager.</para>
     /// </remarks>
     protected virtual void OnTrainingStart(Vector<TInput> trainInputs, Vector<TOutput>? trainLabels)
     {
-        // Default: no-op, derived classes can override
+        // Initialize checkpoint manager if config is provided
+        if (CheckpointConfig != null)
+        {
+            CheckpointManager = new DistillationCheckpointManager<T>(CheckpointConfig);
+        }
     }
 
     /// <summary>
@@ -439,10 +491,29 @@ protected virtual void OnTrainingStart(Vector<TInput> trainInputs, Vector<TOutpu
     /// - Save final checkpoints
     /// - Log final metrics
     /// - Free temporary resources</para>
+    ///
+    /// <para><b>Automatic Checkpointing:</b> If <see cref="CheckpointConfig"/> is set, this method
+    /// automatically loads the best checkpoint (based on validation metrics) after training completes.</para>
     /// </remarks>
     protected virtual void OnTrainingEnd(Vector<TInput> trainInputs, Vector<TOutput>? trainLabels)
     {
-        // Default: no-op, derived classes can override
+        // Load best checkpoint if checkpointing was enabled
+        if (CheckpointManager != null && Student != null)
+        {
+            var bestCheckpoint = CheckpointManager.LoadBestCheckpoint(
+                student: Student,
+                teacher: Teacher as ICheckpointableModel
+            );
+
+            if (bestCheckpoint != null)
+            {
+                Console.WriteLine($"[Checkpointing] Loaded best checkpoint from epoch {bestCheckpoint.Epoch}");
+                if (bestCheckpoint.Metrics.ContainsKey(CheckpointConfig!.BestMetric))
+                {
+                    Console.WriteLine($"[Checkpointing] Best {CheckpointConfig.BestMetric}: {bestCheckpoint.Metrics[CheckpointConfig.BestMetric]:F4}");
+                }
+            }
+        }
     }
 
     /// <summary>
@@ -478,6 +549,9 @@ protected virtual void OnEpochStart(int epoch, Vector<TInput> trainInputs, Vecto
     /// <para><b>IMPORTANT:</b> This base implementation calls Reset() on RelationalDistillationStrategy
     /// to flush partial batches and prevent buffer leakage between epochs. Derived classes should
     /// call base.OnEpochEnd() if they override this method.</para>
+    ///
+    /// <para><b>Automatic Checkpointing:</b> If <see cref="CheckpointConfig"/> is set, this method
+    /// automatically saves checkpoints based on your configuration.</para>
     /// </remarks>
     protected virtual void OnEpochEnd(int epoch, T avgLoss)
     {
@@ -487,6 +561,33 @@ protected virtual void OnEpochEnd(int epoch, T avgLoss)
         {
             relationalStrategy.Reset();
         }
+
+        // Track current state for checkpointing
+        _currentEpoch = epoch;
+        _lastTrainingLoss = avgLoss;
+
+        // Automatic checkpoint saving
+        if (CheckpointManager != null)
+        {
+            var metrics = new Dictionary<string, double>
+            {
+                { "training_loss", Convert.ToDouble(_lastTrainingLoss) }
+            };
+
+            // Include validation metric if available
+            if (_lastValidationMetric > 0)
+            {
+                metrics[CheckpointConfig!.BestMetric] = _lastValidationMetric;
+            }
+
+            CheckpointManager.SaveCheckpointIfNeeded(
+                epoch: epoch,
+                student: Student,
+                teacher: Teacher as ICheckpointableModel,
+                strategy: DistillationStrategy,
+                metrics: metrics
+            );
+        }
     }
 
     /// <summary>
@@ -500,9 +601,13 @@ protected virtual void OnEpochEnd(int epoch, T avgLoss)
     /// - Implement early stopping
     /// - Track best model
     /// - Adjust hyperparameters based on validation performance</para>
+    ///
+    /// <para><b>Automatic Checkpointing:</b> If <see cref="CheckpointConfig"/> is set, this method
+    /// automatically tracks validation metrics for best checkpoint selection.</para>
     /// </remarks>
     protected virtual void OnValidationComplete(int epoch, double accuracy)
     {
-        // Default: no-op, derived classes can override
+        // Track validation metric for checkpointing
+        _lastValidationMetric = accuracy;
     }
 }