Skip to content

Commit 2bc248f

Browse files
committed
fix: remove double-scaling of distributionWeight in ProbabilisticDistillationStrategy
The standard distillation term was being scaled by (1 - _distributionWeight) twice:

1. Once when computing softLoss (line 63)
2. Again when multiplying combinedLoss (line 73 for trueLabels case)

This double-scaling incorrectly reduced the soft component by (1 - distributionWeight)^2 instead of (1 - distributionWeight).

Fixed by:

ComputeLoss:
- Removed distributionWeight from initial softLoss scaling (line 63)
- Computing finalLoss (either combinedLoss or softLoss)
- Applying (1.0 - _distributionWeight) scaling exactly once at the end

ComputeGradient:
- Removed distributionWeight from soft gradient scaling
- Removed distributionWeight from hard gradient blending
- Computing combined gradient: Alpha * hardGrad + (1 - Alpha) * softGrad
- Applying (1.0 - _distributionWeight) scaling exactly once per element

Now distributionWeight correctly balances:
- (1 - distributionWeight) * standard_distillation
- distributionWeight * distributional_matching

Note: KL divergence already uses the correct direction, KLDivergence(teacherSoft, studentSoft), which computes KL(teacher || student), matching the gradient computation.
1 parent c87817d commit 2bc248f

File tree

1 file changed

+35
-14
lines changed

1 file changed

+35
-14
lines changed

src/KnowledgeDistillation/Strategies/ProbabilisticDistillationStrategy.cs

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -56,24 +56,29 @@ public override T ComputeLoss(Vector<T> studentOutput, Vector<T> teacherOutput,
5656
{
5757
ValidateOutputDimensions(studentOutput, teacherOutput, v => v.Length);
5858

59-
// Standard distillation loss (weighted)
59+
// Standard distillation loss
6060
var studentSoft = Softmax(studentOutput, Temperature);
6161
var teacherSoft = Softmax(teacherOutput, Temperature);
6262
var softLoss = KLDivergence(teacherSoft, studentSoft);
63-
softLoss = NumOps.Multiply(softLoss, NumOps.FromDouble(Temperature * Temperature * (1.0 - _distributionWeight)));
63+
softLoss = NumOps.Multiply(softLoss, NumOps.FromDouble(Temperature * Temperature));
6464

65+
T finalLoss;
6566
if (trueLabels != null)
6667
{
6768
ValidateLabelDimensions(studentOutput, trueLabels, v => v.Length);
6869
var studentProbs = Softmax(studentOutput, 1.0);
6970
var hardLoss = CrossEntropy(studentProbs, trueLabels);
70-
var combinedLoss = NumOps.Add(
71+
finalLoss = NumOps.Add(
7172
NumOps.Multiply(NumOps.FromDouble(Alpha), hardLoss),
7273
NumOps.Multiply(NumOps.FromDouble(1.0 - Alpha), softLoss));
73-
return NumOps.Multiply(combinedLoss, NumOps.FromDouble(1.0 - _distributionWeight));
74+
}
75+
else
76+
{
77+
finalLoss = softLoss;
7478
}
7579

76-
return softLoss;
80+
// Apply distribution weight reduction exactly once
81+
return NumOps.Multiply(finalLoss, NumOps.FromDouble(1.0 - _distributionWeight));
7782
}
7883

7984
public override Vector<T> ComputeGradient(Vector<T> studentOutput, Vector<T> teacherOutput, Vector<T>? trueLabels = null)
@@ -86,23 +91,39 @@ public override Vector<T> ComputeGradient(Vector<T> studentOutput, Vector<T> tea
8691
var studentSoft = Softmax(studentOutput, Temperature);
8792
var teacherSoft = Softmax(teacherOutput, Temperature);
8893

89-
for (int i = 0; i < n; i++)
90-
{
91-
var diff = NumOps.Subtract(studentSoft[i], teacherSoft[i]);
92-
gradient[i] = NumOps.Multiply(diff, NumOps.FromDouble(Temperature * Temperature * (1.0 - _distributionWeight)));
93-
}
94-
9594
if (trueLabels != null)
9695
{
9796
ValidateLabelDimensions(studentOutput, trueLabels, v => v.Length);
9897
var studentProbs = Softmax(studentOutput, 1.0);
9998

10099
for (int i = 0; i < n; i++)
101100
{
101+
// Soft gradient (temperature-scaled)
102+
var softGrad = NumOps.Subtract(studentSoft[i], teacherSoft[i]);
103+
softGrad = NumOps.Multiply(softGrad, NumOps.FromDouble(Temperature * Temperature));
104+
105+
// Hard gradient
102106
var hardGrad = NumOps.Subtract(studentProbs[i], trueLabels[i]);
103-
gradient[i] = NumOps.Add(
104-
NumOps.Multiply(NumOps.FromDouble(Alpha * (1.0 - _distributionWeight)), hardGrad),
105-
NumOps.Multiply(NumOps.FromDouble((1.0 - Alpha) * (1.0 - _distributionWeight)), gradient[i]));
107+
108+
// Combined gradient: Alpha * hardGrad + (1 - Alpha) * softGrad
109+
var combined = NumOps.Add(
110+
NumOps.Multiply(NumOps.FromDouble(Alpha), hardGrad),
111+
NumOps.Multiply(NumOps.FromDouble(1.0 - Alpha), softGrad));
112+
113+
// Apply distribution weight reduction exactly once
114+
gradient[i] = NumOps.Multiply(combined, NumOps.FromDouble(1.0 - _distributionWeight));
115+
}
116+
}
117+
else
118+
{
119+
for (int i = 0; i < n; i++)
120+
{
121+
// Soft gradient (temperature-scaled)
122+
var softGrad = NumOps.Subtract(studentSoft[i], teacherSoft[i]);
123+
softGrad = NumOps.Multiply(softGrad, NumOps.FromDouble(Temperature * Temperature));
124+
125+
// Apply distribution weight reduction exactly once
126+
gradient[i] = NumOps.Multiply(softGrad, NumOps.FromDouble(1.0 - _distributionWeight));
106127
}
107128
}
108129

0 commit comments

Comments (0)