@@ -8,8 +8,8 @@ namespace AiDotNet.Optimizers;
88/// Modified Gradient Descent optimizer for Hope architecture.
99/// Based on Equations 27-29 from "Nested Learning" paper.
1010///
11- /// Traditional GD: W( t+1) = W(t) - eta * gradient(L(W(t); x(t))) outer-product x(t)
12- /// Modified GD: W( t+1) = W(t) * (I - x(t)*x(t) ^T) - eta * gradient(L(W(t); x(t))) outer-product x(t)
11+ /// Traditional GD: W_{t+1} = W_t - η * ∇L(W_t; x_t) ⊗ x_t
12+ /// Modified GD: W_{t+1} = W_t * (I - x_t*x_t^T) - η * ∇L(W_t; x_t) ⊗ x_t
1313///
1414/// This formulation uses L2 regression objective instead of dot-product similarity,
1515/// resulting in better handling of data dependencies in token space.
@@ -23,7 +23,7 @@ public class ModifiedGradientDescentOptimizer<T>
2323 /// <summary>
2424 /// Creates a modified gradient descent optimizer.
2525 /// </summary>
26- /// <param name="learningRate">Learning rate (eta) </param>
26+ /// <param name="learningRate">Learning rate η</param>
2727 public ModifiedGradientDescentOptimizer ( T learningRate )
2828 {
2929 _learningRate = learningRate ;
@@ -32,33 +32,33 @@ public ModifiedGradientDescentOptimizer(T learningRate)
3232 /// <summary>
3333 /// Updates parameters using modified gradient descent (Equations 27-29).
3434 ///
35- /// min_W ||W*x(t) - gradient_y(L(W(t); x(t)))||^2
35+ /// min_W ||W*x_t - ∇_y L(W_t; x_t)||²
3636 ///
3737 /// Results in:
38- /// W( t+1) = W(t) * (I - x(t)*x(t) ^T) - eta * gradient_y(L(W(t); x(t))) outer-product x(t)
38+ /// W_{t+1} = W_t * (I - x_t*x_t^T) - η * ∇_y L(W_t; x_t) ⊗ x_t
3939 /// </summary>
40- /// <param name="currentParameters">Current parameter matrix W(t) </param>
41- /// <param name="input">Input vector x(t) </param>
42- /// <param name="outputGradient">Gradient gradient_y(L(W(t); x(t)) )</param>
43- /// <returns>Updated parameters W( t+1) </returns>
40+ /// <param name="currentParameters">Current parameter matrix W_t</param>
41+ /// <param name="input">Input vector x_t</param>
42+ /// <param name="outputGradient">Gradient ∇_y L(W_t; x_t)</param>
43+ /// <returns>Updated parameters W_{t+1}</returns>
4444 public Matrix < T > UpdateMatrix ( Matrix < T > currentParameters , Vector < T > input , Vector < T > outputGradient )
4545 {
4646 int rows = currentParameters . Rows ;
4747 int cols = currentParameters . Columns ;
4848
49- // Compute (I - x(t)*x(t) ^T)
49+ // Compute (I - x_t*x_t^T)
5050 var identityMinusOuterProduct = ComputeIdentityMinusOuterProduct ( input ) ;
5151
52- // Compute W(t) * (I - x(t)*x(t) ^T)
52+ // Compute W_t * (I - x_t*x_t^T)
5353 var firstTerm = currentParameters . Multiply ( identityMinusOuterProduct ) ;
5454
55- // Compute gradient_y(L(W(t); x(t))) outer-product x(t )
55+ // Compute ∇_y L(W_t; x_t) ⊗ x_t (outer product)
5656 var gradientUpdate = ComputeOuterProduct ( outputGradient , input ) ;
5757
58- // Scale by learning rate: eta * (gradient_y(L) outer-product x(t) )
58+ // Scale by learning rate: η * (∇_y L ⊗ x_t)
5959 var scaledGradient = gradientUpdate . Multiply ( _learningRate ) ;
6060
61- // Final update: W( t+1) = W(t) * (I - x(t)*x(t) ^T) - eta * (gradient_y(L) outer-product x(t) )
61+ // Final update: W_{t+1} = W_t * (I - x_t*x_t^T) - η * (∇_y L ⊗ x_t)
6262 var updated = firstTerm . Subtract ( scaledGradient ) ;
6363
6464 return updated ;
@@ -91,11 +91,11 @@ public Vector<T> UpdateVector(Vector<T> currentParameters, Vector<T> input, Vect
9191 // Apply modified update rule
9292 for ( int i = 0 ; i < currentParameters . Length ; i ++ )
9393 {
94- // Standard GD component: -eta * gradient
94+ // Standard GD component: -η * gradient
9595 T gradComponent = _numOps . Multiply ( outputGradient [ i ] , _learningRate ) ;
9696
97- // Modification: scale by (1 - ||x(t)||^2 ) factor for regularization
98- // CRITICAL: Clip to prevent negative scaling when ||x(t)||^2 > 1
97+ // Modification: scale by (1 - ||x_t||²) factor for regularization
98+ // CRITICAL: Clip to prevent negative scaling when ||x_t||² > 1
9999 // Without clipping, parameters would explode when input norm exceeds 1
100100 T modFactor = _numOps . Subtract ( _numOps . One , inputNormSquared ) ;
101101 if ( _numOps . LessThan ( modFactor , _numOps . Zero ) )
@@ -112,7 +112,7 @@ public Vector<T> UpdateVector(Vector<T> currentParameters, Vector<T> input, Vect
112112 }
113113
114114 /// <summary>
115- /// Computes (I - x(t)*x(t) ^T) where x(t) is the input vector.
115+ /// Computes (I - x_t*x_t^T) where x_t is the input vector.
116116 /// This is the modification term that accounts for data dependencies.
117117 /// </summary>
118118 private Matrix < T > ComputeIdentityMinusOuterProduct ( Vector < T > input )
@@ -126,7 +126,7 @@ private Matrix<T> ComputeIdentityMinusOuterProduct(Vector<T> input)
126126 result [ i , i ] = _numOps . One ;
127127 }
128128
129- // Subtract outer product: x(t)*x(t) ^T
129+ // Subtract outer product: x_t*x_t^T
130130 for ( int i = 0 ; i < dim ; i ++ )
131131 {
132132 for ( int j = 0 ; j < dim ; j ++ )
@@ -140,7 +140,7 @@ private Matrix<T> ComputeIdentityMinusOuterProduct(Vector<T> input)
140140 }
141141
142142 /// <summary>
143- /// Computes outer product of two vectors: a outer-product b = a*b^T
143+ /// Computes outer product of two vectors: a ⊗ b = a*b^T
144144 /// </summary>
145145 private Matrix < T > ComputeOuterProduct ( Vector < T > a , Vector < T > b )
146146 {
0 commit comments