Skip to content

Commit 74753e7

Browse files
committed
Revert encoding changes - Unicode math symbols are correct
The Unicode mathematical symbols (η, ∇, ⊗, ²) are correct and display properly in UTF-8. They should NOT be replaced with ASCII. Reverted previous incorrect changes that removed: - η (eta) - ∇ (nabla/gradient) - ⊗ (tensor product/outer product) - ² (superscript 2) - Subscript notation like W_t, x_t These Unicode characters are standard in mathematical documentation and work perfectly fine in C# XML comments.
1 parent cf20dc4 commit 74753e7

File tree

1 file changed

+20
-20
lines changed

1 file changed

+20
-20
lines changed

src/Optimizers/ModifiedGradientDescentOptimizer.cs

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ namespace AiDotNet.Optimizers;
88
/// Modified Gradient Descent optimizer for Hope architecture.
99
/// Based on Equations 27-29 from "Nested Learning" paper.
1010
///
11-
/// Traditional GD: W(t+1) = W(t) - eta * gradient(L(W(t); x(t))) outer-product x(t)
12-
/// Modified GD: W(t+1) = W(t) * (I - x(t)*x(t)^T) - eta * gradient(L(W(t); x(t))) outer-product x(t)
11+
/// Traditional GD: W_{t+1} = W_t - η * ∇L(W_t; x_t) ⊗ x_t
12+
/// Modified GD: W_{t+1} = W_t * (I - x_t*x_t^T) - η * ∇L(W_t; x_t) ⊗ x_t
1313
///
1414
/// This formulation uses L2 regression objective instead of dot-product similarity,
1515
/// resulting in better handling of data dependencies in token space.
@@ -23,7 +23,7 @@ public class ModifiedGradientDescentOptimizer<T>
2323
/// <summary>
2424
/// Creates a modified gradient descent optimizer.
2525
/// </summary>
26-
/// <param name="learningRate">Learning rate (eta)</param>
26+
/// <param name="learningRate">Learning rate η</param>
2727
public ModifiedGradientDescentOptimizer(T learningRate)
2828
{
2929
_learningRate = learningRate;
@@ -32,33 +32,33 @@ public ModifiedGradientDescentOptimizer(T learningRate)
3232
/// <summary>
3333
/// Updates parameters using modified gradient descent (Equations 27-29).
3434
///
35-
/// min_W ||W*x(t) - gradient_y(L(W(t); x(t)))||^2
35+
/// min_W ||W*x_t - ∇_y L(W_t; x_t)||²
3636
///
3737
/// Results in:
38-
/// W(t+1) = W(t) * (I - x(t)*x(t)^T) - eta * gradient_y(L(W(t); x(t))) outer-product x(t)
38+
/// W_{t+1} = W_t * (I - x_t*x_t^T) - η * ∇_y L(W_t; x_t) ⊗ x_t
3939
/// </summary>
40-
/// <param name="currentParameters">Current parameter matrix W(t)</param>
41-
/// <param name="input">Input vector x(t)</param>
42-
/// <param name="outputGradient">Gradient gradient_y(L(W(t); x(t)))</param>
43-
/// <returns>Updated parameters W(t+1)</returns>
40+
/// <param name="currentParameters">Current parameter matrix W_t</param>
41+
/// <param name="input">Input vector x_t</param>
42+
/// <param name="outputGradient">Gradient ∇_y L(W_t; x_t)</param>
43+
/// <returns>Updated parameters W_{t+1}</returns>
4444
public Matrix<T> UpdateMatrix(Matrix<T> currentParameters, Vector<T> input, Vector<T> outputGradient)
4545
{
4646
int rows = currentParameters.Rows;
4747
int cols = currentParameters.Columns;
4848

49-
// Compute (I - x(t)*x(t)^T)
49+
// Compute (I - x_t*x_t^T)
5050
var identityMinusOuterProduct = ComputeIdentityMinusOuterProduct(input);
5151

52-
// Compute W(t) * (I - x(t)*x(t)^T)
52+
// Compute W_t * (I - x_t*x_t^T)
5353
var firstTerm = currentParameters.Multiply(identityMinusOuterProduct);
5454

55-
// Compute gradient_y(L(W(t); x(t))) outer-product x(t)
55+
// Compute ∇_y L(W_t; x_t) ⊗ x_t (outer product)
5656
var gradientUpdate = ComputeOuterProduct(outputGradient, input);
5757

58-
// Scale by learning rate: eta * (gradient_y(L) outer-product x(t))
58+
// Scale by learning rate: η * (∇_y L ⊗ x_t)
5959
var scaledGradient = gradientUpdate.Multiply(_learningRate);
6060

61-
// Final update: W(t+1) = W(t) * (I - x(t)*x(t)^T) - eta * (gradient_y(L) outer-product x(t))
61+
// Final update: W_{t+1} = W_t * (I - x_t*x_t^T) - η * (∇_y L ⊗ x_t)
6262
var updated = firstTerm.Subtract(scaledGradient);
6363

6464
return updated;
@@ -91,11 +91,11 @@ public Vector<T> UpdateVector(Vector<T> currentParameters, Vector<T> input, Vect
9191
// Apply modified update rule
9292
for (int i = 0; i < currentParameters.Length; i++)
9393
{
94-
// Standard GD component: -eta * gradient
94+
// Standard GD component: -η * gradient
9595
T gradComponent = _numOps.Multiply(outputGradient[i], _learningRate);
9696

97-
// Modification: scale by (1 - ||x(t)||^2) factor for regularization
98-
// CRITICAL: Clip to prevent negative scaling when ||x(t)||^2 > 1
97+
// Modification: scale by (1 - ||x_t||²) factor for regularization
98+
// CRITICAL: Clip to prevent negative scaling when ||x_t||² > 1
9999
// Without clipping, parameters would explode when input norm exceeds 1
100100
T modFactor = _numOps.Subtract(_numOps.One, inputNormSquared);
101101
if (_numOps.LessThan(modFactor, _numOps.Zero))
@@ -112,7 +112,7 @@ public Vector<T> UpdateVector(Vector<T> currentParameters, Vector<T> input, Vect
112112
}
113113

114114
/// <summary>
115-
/// Computes (I - x(t)*x(t)^T) where x(t) is the input vector.
115+
/// Computes (I - x_t*x_t^T) where x_t is the input vector.
116116
/// This is the modification term that accounts for data dependencies.
117117
/// </summary>
118118
private Matrix<T> ComputeIdentityMinusOuterProduct(Vector<T> input)
@@ -126,7 +126,7 @@ private Matrix<T> ComputeIdentityMinusOuterProduct(Vector<T> input)
126126
result[i, i] = _numOps.One;
127127
}
128128

129-
// Subtract outer product: x(t)*x(t)^T
129+
// Subtract outer product: x_t*x_t^T
130130
for (int i = 0; i < dim; i++)
131131
{
132132
for (int j = 0; j < dim; j++)
@@ -140,7 +140,7 @@ private Matrix<T> ComputeIdentityMinusOuterProduct(Vector<T> input)
140140
}
141141

142142
/// <summary>
143-
/// Computes outer product of two vectors: a outer-product b = a*b^T
143+
/// Computes outer product of two vectors: a ⊗ b = a*b^T
144144
/// </summary>
145145
private Matrix<T> ComputeOuterProduct(Vector<T> a, Vector<T> b)
146146
{

0 commit comments

Comments
 (0)