Skip to content

Commit baa7938

Browse files
committed
refactor: improve gradient computation structure in FactorTransferDistillationStrategy
Refactored ComputeGradient to compute combined gradients first, then apply (1.0 - _factorWeight) scaling exactly once per element. This eliminates the separate final loop and makes the logic clearer.

Changes:
- Compute softGrad = temperature-scaled soft difference (without factorWeight)
- Compute hardGrad = studentProbs - trueLabels
- Form combined = Alpha * hardGrad + (1 - Alpha) * softGrad
- Multiply the final combined value by (1.0 - _factorWeight) before assigning to gradient[i]
- Handle both the trueLabels != null and trueLabels == null cases cleanly

This ensures (1.0 - _factorWeight) is applied exactly once per gradient element in a single assignment, improving clarity and efficiency.
1 parent 5402416 commit baa7938

File tree

1 file changed

+22
-12
lines changed

1 file changed

+22
-12
lines changed

src/KnowledgeDistillation/Strategies/FactorTransferDistillationStrategy.cs

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -110,30 +110,40 @@ public override Vector<T> ComputeGradient(Vector<T> studentOutput, Vector<T> tea
110110
var studentSoft = Softmax(studentOutput, Temperature);
111111
var teacherSoft = Softmax(teacherOutput, Temperature);
112112

113-
for (int i = 0; i < n; i++)
114-
{
115-
var diff = NumOps.Subtract(studentSoft[i], teacherSoft[i]);
116-
gradient[i] = NumOps.Multiply(diff, NumOps.FromDouble(Temperature * Temperature));
117-
}
118-
119113
if (trueLabels != null)
120114
{
121115
ValidateLabelDimensions(studentOutput, trueLabels, v => v.Length);
122116
var studentProbs = Softmax(studentOutput, 1.0);
123117

124118
for (int i = 0; i < n; i++)
125119
{
120+
// Soft gradient (temperature-scaled)
121+
var softGrad = NumOps.Subtract(studentSoft[i], teacherSoft[i]);
122+
softGrad = NumOps.Multiply(softGrad, NumOps.FromDouble(Temperature * Temperature));
123+
124+
// Hard gradient
126125
var hardGrad = NumOps.Subtract(studentProbs[i], trueLabels[i]);
127-
gradient[i] = NumOps.Add(
126+
127+
// Combined gradient: Alpha * hardGrad + (1 - Alpha) * softGrad
128+
var combined = NumOps.Add(
128129
NumOps.Multiply(NumOps.FromDouble(Alpha), hardGrad),
129-
NumOps.Multiply(NumOps.FromDouble(1.0 - Alpha), gradient[i]));
130+
NumOps.Multiply(NumOps.FromDouble(1.0 - Alpha), softGrad));
131+
132+
// Apply factor weight reduction exactly once
133+
gradient[i] = NumOps.Multiply(combined, NumOps.FromDouble(1.0 - _factorWeight));
130134
}
131135
}
132-
133-
// Apply factor weight reduction exactly once
134-
for (int i = 0; i < n; i++)
136+
else
135137
{
136-
gradient[i] = NumOps.Multiply(gradient[i], NumOps.FromDouble(1.0 - _factorWeight));
138+
for (int i = 0; i < n; i++)
139+
{
140+
// Soft gradient (temperature-scaled)
141+
var softGrad = NumOps.Subtract(studentSoft[i], teacherSoft[i]);
142+
softGrad = NumOps.Multiply(softGrad, NumOps.FromDouble(Temperature * Temperature));
143+
144+
// Apply factor weight reduction exactly once
145+
gradient[i] = NumOps.Multiply(softGrad, NumOps.FromDouble(1.0 - _factorWeight));
146+
}
137147
}
138148

139149
return gradient;

0 commit comments

Comments (0)