Skip to content

Commit 5402416

Browse files
committed
fix: remove double-scaling of factorWeight in FactorTransferDistillationStrategy
The standard distillation term was being scaled by (1 - _factorWeight) twice: 1. Once when computing softLoss (line 82) 2. Again when multiplying combinedLoss (line 92 for trueLabels case) This double-scaling incorrectly reduced the soft component by (1-factorWeight)^2 instead of (1-factorWeight). Fixed by: - Removing factorWeight from initial softLoss scaling (line 82) - Computing finalLoss (either combinedLoss or softLoss) - Applying (1.0 - _factorWeight) scaling exactly once at the end Same fix applied to ComputeGradient method: - Removed factorWeight from soft gradient scaling (line 111) - Removed factorWeight from hard gradient blending (lines 123-124) - Applied (1.0 - _factorWeight) scaling exactly once at the end Now factorWeight correctly balances: - (1 - factorWeight) * standard_distillation - factorWeight * factor_transfer
1 parent 6f8b5a3 commit 5402416

File tree

1 file changed

+19
-8
lines changed

src/KnowledgeDistillation/Strategies/FactorTransferDistillationStrategy.cs

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -75,24 +75,29 @@ public override T ComputeLoss(Vector<T> studentOutput, Vector<T> teacherOutput,
7575
{
7676
ValidateOutputDimensions(studentOutput, teacherOutput, v => v.Length);
7777

78-
// Standard distillation loss (weighted)
78+
// Standard distillation loss
7979
var studentSoft = Softmax(studentOutput, Temperature);
8080
var teacherSoft = Softmax(teacherOutput, Temperature);
8181
var softLoss = KLDivergence(teacherSoft, studentSoft);
82-
softLoss = NumOps.Multiply(softLoss, NumOps.FromDouble(Temperature * Temperature * (1.0 - _factorWeight)));
82+
softLoss = NumOps.Multiply(softLoss, NumOps.FromDouble(Temperature * Temperature));
8383

84+
T finalLoss;
8485
if (trueLabels != null)
8586
{
8687
ValidateLabelDimensions(studentOutput, trueLabels, v => v.Length);
8788
var studentProbs = Softmax(studentOutput, 1.0);
8889
var hardLoss = CrossEntropy(studentProbs, trueLabels);
89-
var combinedLoss = NumOps.Add(
90+
finalLoss = NumOps.Add(
9091
NumOps.Multiply(NumOps.FromDouble(Alpha), hardLoss),
9192
NumOps.Multiply(NumOps.FromDouble(1.0 - Alpha), softLoss));
92-
return NumOps.Multiply(combinedLoss, NumOps.FromDouble(1.0 - _factorWeight));
93+
}
94+
else
95+
{
96+
finalLoss = softLoss;
9397
}
9498

95-
return softLoss;
99+
// Apply factor weight reduction exactly once
100+
return NumOps.Multiply(finalLoss, NumOps.FromDouble(1.0 - _factorWeight));
96101
}
97102

98103
public override Vector<T> ComputeGradient(Vector<T> studentOutput, Vector<T> teacherOutput, Vector<T>? trueLabels = null)
@@ -108,7 +113,7 @@ public override Vector<T> ComputeGradient(Vector<T> studentOutput, Vector<T> tea
108113
for (int i = 0; i < n; i++)
109114
{
110115
var diff = NumOps.Subtract(studentSoft[i], teacherSoft[i]);
111-
gradient[i] = NumOps.Multiply(diff, NumOps.FromDouble(Temperature * Temperature * (1.0 - _factorWeight)));
116+
gradient[i] = NumOps.Multiply(diff, NumOps.FromDouble(Temperature * Temperature));
112117
}
113118

114119
if (trueLabels != null)
@@ -120,11 +125,17 @@ public override Vector<T> ComputeGradient(Vector<T> studentOutput, Vector<T> tea
120125
{
121126
var hardGrad = NumOps.Subtract(studentProbs[i], trueLabels[i]);
122127
gradient[i] = NumOps.Add(
123-
NumOps.Multiply(NumOps.FromDouble(Alpha * (1.0 - _factorWeight)), hardGrad),
124-
NumOps.Multiply(NumOps.FromDouble((1.0 - Alpha) * (1.0 - _factorWeight)), gradient[i]));
128+
NumOps.Multiply(NumOps.FromDouble(Alpha), hardGrad),
129+
NumOps.Multiply(NumOps.FromDouble(1.0 - Alpha), gradient[i]));
125130
}
126131
}
127132

133+
// Apply factor weight reduction exactly once
134+
for (int i = 0; i < n; i++)
135+
{
136+
gradient[i] = NumOps.Multiply(gradient[i], NumOps.FromDouble(1.0 - _factorWeight));
137+
}
138+
128139
return gradient;
129140
}
130141

0 commit comments

Comments
 (0)