Commit 5aeb4c0

Update adam optimizer.
1 parent 77c4d9f commit 5aeb4c0

3 files changed, +41 -6 lines changed

src/RMAD/AdamOptimizer.cs

Lines changed: 26 additions & 5 deletions
@@ -6,7 +6,7 @@
 namespace ParallelReverseAutoDiff.RMAD
 {
     using System;
-    using System.Collections.Generic;
+    using System.Diagnostics;
     using System.Threading.Tasks;
 
     /// <summary>
@@ -93,26 +93,47 @@ public void Optimize(IModelLayer[] layers)
         private void UpdateWeightWithAdam(Matrix w, Matrix mW, Matrix vW, Matrix gradient, double beta1, double beta2, double epsilon)
         {
             // Update biased first moment estimate
-            mW = MatrixUtils.MatrixAdd(MatrixUtils.ScalarMultiply(beta1, mW), MatrixUtils.ScalarMultiply(1 - beta1, gradient));
+            var firstMoment = MatrixUtils.MatrixAdd(MatrixUtils.ScalarMultiply(beta1, mW), MatrixUtils.ScalarMultiply(1 - beta1, gradient));
 
             // Update biased second raw moment estimate
-            vW = MatrixUtils.MatrixAdd(MatrixUtils.ScalarMultiply(beta2, vW), MatrixUtils.ScalarMultiply(1 - beta2, MatrixUtils.HadamardProduct(gradient, gradient)));
+            var secondMoment = MatrixUtils.MatrixAdd(MatrixUtils.ScalarMultiply(beta2, vW), MatrixUtils.ScalarMultiply(1 - beta2, MatrixUtils.HadamardProduct(gradient, gradient)));
 
             // Compute bias-corrected first moment estimate
-            Matrix mW_hat = MatrixUtils.ScalarMultiply(1 / (1 - Math.Pow(beta1, this.network.Parameters.AdamIteration)), mW);
+            Matrix mW_hat = MatrixUtils.ScalarMultiply(1 / (1 - Math.Pow(beta1, this.network.Parameters.AdamIteration)), firstMoment);
 
             // Compute bias-corrected second raw moment estimate
-            Matrix vW_hat = MatrixUtils.ScalarMultiply(1 / (1 - Math.Pow(beta2, this.network.Parameters.AdamIteration)), vW);
+            Matrix vW_hat = MatrixUtils.ScalarMultiply(1 / (1 - Math.Pow(beta2, this.network.Parameters.AdamIteration)), secondMoment);
 
             // Update weights
             for (int i = 0; i < w.Length; i++)
             {
                 for (int j = 0; j < w[0].Length; j++)
                 {
                     double weightReductionValue = this.network.Parameters.LearningRate * mW_hat[i][j] / (Math.Sqrt(vW_hat[i][j]) + epsilon);
+#if DEBUG
+                    Debug.WriteLine(weightReductionValue + " vs gradient: " + gradient[i][j]);
+#endif
                     w[i][j] -= weightReductionValue;
                 }
             }
+
+            // Update first moment
+            for (int i = 0; i < mW.Length; i++)
+            {
+                for (int j = 0; j < mW[0].Length; j++)
+                {
+                    mW[i][j] = firstMoment[i][j];
+                }
+            }
+
+            // Update second moment
+            for (int i = 0; i < vW.Length; i++)
+            {
+                for (int j = 0; j < vW[0].Length; j++)
+                {
+                    vW[i][j] = secondMoment[i][j];
+                }
+            }
         }
     }
 }
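
The change above fixes a genuine bug rather than just refactoring: C# passes object references by value, so the old assignments `mW = ...` and `vW = ...` only rebound the local parameters, and the caller's moment matrices were never updated between iterations. The rewrite computes the new moments into locals, uses those for the bias-corrected estimates, and then copies them back element-wise so the update persists. For reference, the method implements the standard Adam rule (stated here for clarity; it is not spelled out in the commit), with gradient $g_t$, iteration $t$ (`AdamIteration`), and learning rate $\alpha$:

$$m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t, \qquad v_t = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t \odot g_t,$$

$$\hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1 - \beta_2^t}, \qquad w \leftarrow w - \frac{\alpha\, \hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}.$$

A minimal standalone sketch of the reference-semantics pitfall (hypothetical demo types, not code from this repository):

```csharp
using System;

public static class ReferenceSemanticsDemo
{
    // Rebinding the parameter: the caller's array is untouched.
    private static void Reassign(double[] m)
    {
        m = new[] { 1.0 };
    }

    // Writing through the reference: the caller sees the update.
    private static void WriteBack(double[] m)
    {
        m[0] = 1.0;
    }

    public static void Main()
    {
        var moments = new[] { 0.0 };
        Reassign(moments);
        Console.WriteLine(moments[0]); // prints 0: the reassignment was lost
        WriteBack(moments);
        Console.WriteLine(moments[0]); // prints 1: the element-wise write persisted
    }
}
```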

test/ParallelReverseAutoDiff.Test/GraphAttentionPaths/GraphAttentionPathsNeuralNetwork.cs

Lines changed: 13 additions & 0 deletions
@@ -164,6 +164,19 @@ public void ApplyWeights()
             }
         }
 
+        /// <summary>
+        /// Apply the gradients to update the weights.
+        /// </summary>
+        public void ApplyGradients()
+        {
+            var clipper = this.readoutNeuralNetwork.Utilities.GradientClipper;
+            clipper.Clip(this.modelLayers.ToArray());
+            var adamOptimizer = this.readoutNeuralNetwork.Utilities.AdamOptimizer;
+            adamOptimizer.Optimize(this.modelLayers.ToArray());
+            GradientClearer clearer = new GradientClearer();
+            clearer.Clear(this.modelLayers.ToArray());
+        }
+
         /// <summary>
         /// Make a forward pass through the computation graph.
         /// </summary>
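
With `ApplyGradients` in place, one optimization step is: clip the gradients, apply the Adam update, then clear the accumulated gradients. A sketch of how a full training iteration might look with this API (constructor arguments copied from the test below; the epoch loop and `numEpochs` are hypothetical):

```csharp
// Sketch only: assumes `graphs` and `batchSize` are prepared as in the test below.
var neuralNetwork = new GraphAttentionPathsNeuralNetwork(graphs, batchSize, 16, 115, 3, 2, 4, 0.001d, 4d);
await neuralNetwork.Initialize();

for (int epoch = 0; epoch < numEpochs; epoch++)
{
    DeepMatrix gradientOfLoss = neuralNetwork.Forward();  // forward pass; returns the gradient of the loss
    await neuralNetwork.Backward(gradientOfLoss);         // reverse-mode pass through the computation graph
    neuralNetwork.ApplyGradients();                       // clip, Adam step, clear
}
```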

test/ParallelReverseAutoDiff.Test/GraphAttentionPathsNeuralNetworkTest.cs

Lines changed: 2 additions & 1 deletion
@@ -24,10 +24,11 @@ public async Task GivenGraphAttentionPathsNeuralNetworkMiniBatch_ProcessesMiniBa
 
                 int batchSize = 4;
 
-                GraphAttentionPathsNeuralNetwork neuralNetwork = new GraphAttentionPathsNeuralNetwork(graphs, batchSize, 16, 115, 5, 2, 4, 0.001d, 4d);
+                GraphAttentionPathsNeuralNetwork neuralNetwork = new GraphAttentionPathsNeuralNetwork(graphs, batchSize, 16, 115, 3, 2, 4, 0.001d, 4d);
                 await neuralNetwork.Initialize();
                 DeepMatrix gradientOfLoss = neuralNetwork.Forward();
                 await neuralNetwork.Backward(gradientOfLoss);
+                neuralNetwork.ApplyGradients();
             }
             finally
             {
