ooples
diff --git a/‎src/Enums/MatrixDecompositionType.cs‎
Lines changed: 3 additions & 3 deletions b/‎src/Enums/MatrixDecompositionType.cs‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/Enums/OptimizerType.cs‎
Lines changed: 37 additions & 18 deletions b/‎src/Enums/OptimizerType.cs‎
Lines changed: 37 additions & 18 deletions
diff --git a/‎src/Factories/MatrixDecompositionFactory.cs‎
Lines changed: 2 additions & 2 deletions b/‎src/Factories/MatrixDecompositionFactory.cs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/Interfaces/IAssociativeMemory.cs‎
Lines changed: 38 additions & 0 deletions b/‎src/Interfaces/IAssociativeMemory.cs‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎src/Interfaces/IContextFlow.cs‎
Lines changed: 48 additions & 0 deletions b/‎src/Interfaces/IContextFlow.cs‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎src/Models/VectorModel.cs‎
Lines changed: 3 additions & 3 deletions b/‎src/Models/VectorModel.cs‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/NestedLearning/AssociativeMemory.cs‎
Lines changed: 153 additions & 0 deletions b/‎src/NestedLearning/AssociativeMemory.cs‎
Lines changed: 153 additions & 0 deletions
@@ -10,7 +10,7 @@ namespace AiDotNet.Enums;
 /// (grids of numbers) into simpler components to solve problems more efficiently.
 /// 
 /// Think of it as:
-/// - Breaking down a complex number like 15 into its factors 3 � 5
+/// - Breaking down a complex number like 15 into its factors 3 × 5
 /// - Disassembling a complicated machine into its basic parts
 /// - Converting a difficult problem into several easier ones
 /// 
@@ -353,7 +353,7 @@ public enum MatrixDecompositionType
     Bidiagonal,
 
     /// <summary>
-    /// Decomposes a symmetric matrix into the product U�D�U?, where U is upper triangular with 1s on the diagonal and D is diagonal.
+    /// Decomposes a symmetric matrix into the product U·D·Uᵀ, where U is upper triangular with 1s on the diagonal and D is diagonal.
     /// </summary>
     /// <remarks>
     /// <para>
@@ -377,7 +377,7 @@ public enum MatrixDecompositionType
     Udu,
 
     /// <summary>
-    /// Decomposes a symmetric matrix into the product L�D�L?, where L is lower triangular with 1s on the diagonal and D is diagonal.
+    /// Decomposes a symmetric matrix into the product L·D·Lᵀ, where L is lower triangular with 1s on the diagonal and D is diagonal.
     /// </summary>
     /// <remarks>
     /// <para>
 
@@ -146,8 +146,8 @@ public enum OptimizerType
     /// </summary>
     /// <remarks>
     /// <para>
-    /// <b>For Beginners:</b> Adagrad adjusts the learning rate for each parameter based on how frequently 
-    /// it's been updated. Imagine having different step sizes for different terrains – taking smaller steps 
+    /// <b>For Beginners:</b> Adagrad adjusts the learning rate for each parameter based on how frequently
+    /// it's been updated. Imagine having different step sizes for different terrains - taking smaller steps 
     /// on well-explored paths and larger steps in new areas. This works well for sparse data (where many 
     /// features are rarely seen) but can cause the learning rate to become too small over time as it 
     /// continuously shrinks, eventually making learning too slow.
@@ -176,9 +176,9 @@ public enum OptimizerType
     /// <para>
     /// <b>For Beginners:</b> Adadelta is another solution to Adagrad's diminishing learning rates, but it 
     /// goes a step further than RMSprop. It not only tracks a moving average of past squared gradients but 
-    /// also maintains a moving average of past parameter updates. This allows it to continue learning even 
-    /// when the gradients become very small. Adadelta is unique because it doesn't even require setting an 
-    /// initial learning rate – it's like a hiker who can naturally adjust their pace based on both the 
+    /// also maintains a moving average of past parameter updates. This allows it to continue learning even
+    /// when the gradients become very small. Adadelta is unique because it doesn't even require setting an
+    /// initial learning rate - it's like a hiker who can naturally adjust their pace based on both the 
     /// terrain and their own recent energy expenditure.
     /// </para>
     /// </remarks>
@@ -204,9 +204,9 @@ public enum OptimizerType
     /// <remarks>
     /// <para>
     /// <b>For Beginners:</b> Nadam (Nesterov-accelerated Adam) combines the benefits of Adam with those of 
-    /// Nesterov momentum. It takes Adam's ability to adapt learning rates individually for each parameter 
-    /// and adds Nesterov's "look-ahead" approach. This gives you both adaptive step sizes and better 
-    /// directional awareness – like a hiker who not only adjusts their stride based on the terrain but 
+    /// Nesterov momentum. It takes Adam's ability to adapt learning rates individually for each parameter
+    /// and adds Nesterov's "look-ahead" approach. This gives you both adaptive step sizes and better
+    /// directional awareness - like a hiker who not only adjusts their stride based on the terrain but
     /// also scouts ahead before committing to a direction.
     /// </para>
     /// </remarks>
@@ -218,9 +218,9 @@ public enum OptimizerType
     /// <remarks>
     /// <para>
     /// <b>For Beginners:</b> AdamW improves on Adam by handling weight decay (a technique to prevent overfitting) 
-    /// in a more effective way. Regular Adam applies weight decay to the already-adapted gradients, which can 
-    /// make it less effective. AdamW applies weight decay directly to the weights instead. This seemingly small 
-    /// change leads to better generalization – like making sure your backpack stays light throughout your journey, 
+    /// in a more effective way. Regular Adam applies weight decay to the already-adapted gradients, which can
+    /// make it less effective. AdamW applies weight decay directly to the weights instead. This seemingly small
+    /// change leads to better generalization - like making sure your backpack stays light throughout your journey,
     /// rather than only thinking about its weight when deciding how fast to walk. This helps the model perform 
     /// better on new, unseen examples.
     /// </para>
@@ -246,7 +246,7 @@ public enum OptimizerType
     /// </summary>
     /// <remarks>
     /// <para>
-    /// <b>For Beginners:</b> LBFGS (Limited-memory Broyden–Fletcher–Goldfarb–Shanno) is an advanced optimizer 
+    /// <b>For Beginners:</b> LBFGS (Limited-memory Broyden-Fletcher-Goldfarb-Shanno) is an advanced optimizer 
     /// that uses information about the curvature of the error surface (not just the slope). While first-order 
     /// methods like SGD only know which way is downhill, LBFGS also has an idea of how quickly the slope is 
     /// changing in different directions. This is like having not just a compass but also a detailed topographic 
@@ -549,14 +549,33 @@ public enum OptimizerType
     /// </summary>
     /// <remarks>
     /// <para>
-    /// <b>For Beginners:</b> AdaDelta is an advanced optimizer that improves upon AdaGrad by addressing its 
-    /// diminishing learning rates problem. Instead of accumulating all past squared gradients, AdaDelta uses 
-    /// a moving window of gradients, remembering only recent history. What makes AdaDelta special is that it 
-    /// doesn't even require setting an initial learning rate - it adapts automatically based on the relationship 
-    /// between parameter updates and gradients. It's like a hiker who adjusts their pace not just based on the 
-    /// steepness of the terrain, but also on how efficiently they've been covering ground recently. This makes 
+    /// <b>For Beginners:</b> AdaDelta is an advanced optimizer that improves upon AdaGrad by addressing its
+    /// diminishing learning rates problem. Instead of accumulating all past squared gradients, AdaDelta uses
+    /// a moving window of gradients, remembering only recent history. What makes AdaDelta special is that it
+    /// doesn't even require setting an initial learning rate - it adapts automatically based on the relationship
+    /// between parameter updates and gradients. It's like a hiker who adjusts their pace not just based on the
+    /// steepness of the terrain, but also on how efficiently they've been covering ground recently. This makes
     /// AdaDelta particularly robust across different types of problems without requiring manual tuning.
     /// </para>
     /// </remarks>
     AdaDelta,
+
+    /// <summary>
+    /// Nested Learning optimizer - a multi-level optimization paradigm for continual learning.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// <b>For Beginners:</b> Nested Learning is a new paradigm from Google Research that treats ML models as
+    /// interconnected, multi-level learning problems optimized simultaneously. Unlike traditional optimizers
+    /// that update all parameters at the same rate, Nested Learning operates at multiple timescales - some
+    /// parameters update quickly (learning from immediate feedback) while others update slowly (learning
+    /// general patterns). It uses a Continuum Memory System (CMS) that maintains memories at different
+    /// frequencies, mimicking how the human brain has both short-term and long-term memory. This makes it
+    /// particularly good at continual learning - learning new tasks without forgetting old ones. It's like
+    /// having multiple learning strategies working together: one that quickly adapts to new situations,
+    /// another that slowly builds general knowledge, and others in between, all coordinating to prevent
+    /// "catastrophic forgetting" where learning new tasks destroys knowledge of old tasks.
+    /// </para>
+    /// </remarks>
+    NestedLearning,
 }
@@ -5,8 +5,8 @@ namespace AiDotNet.Factories;
 /// </summary>
 /// <remarks>
 /// <para>
-/// <b>For Beginners:</b> Matrix decomposition is a way of breaking down a complex matrix into simpler 
-/// components that are easier to work with mathematically. It's like factoring a number (e.g., 12 = 3 � 4), 
+/// <b>For Beginners:</b> Matrix decomposition is a way of breaking down a complex matrix into simpler
+/// components that are easier to work with mathematically. It's like factoring a number (e.g., 12 = 3 × 4),
 /// but for matrices.
 /// </para>
 /// <para>
 
@@ -0,0 +1,38 @@
+using AiDotNet.LinearAlgebra;
+
+namespace AiDotNet.Interfaces;
+
+/// <summary>
+/// Interface for Associative Memory modules used in nested learning.
+/// Models both backpropagation and attention mechanisms as associative memory.
+/// </summary>
+/// <typeparam name="T">The numeric type</typeparam>
+public interface IAssociativeMemory<T>
+{
+    /// <summary>
+    /// Associates an input with a target output (learns the mapping).
+    /// In backpropagation context: maps data point to local error.
+    /// In attention context: maps queries to key-value pairs.
+    /// </summary>
+    void Associate(Vector<T> input, Vector<T> target);
+
+    /// <summary>
+    /// Retrieves the associated output for a given input query.
+    /// </summary>
+    Vector<T> Retrieve(Vector<T> query);
+
+    /// <summary>
+    /// Updates the memory based on new associations.
+    /// </summary>
+    void Update(Vector<T> input, Vector<T> target, T learningRate);
+
+    /// <summary>
+    /// Gets the memory capacity.
+    /// </summary>
+    int Capacity { get; }
+
+    /// <summary>
+    /// Clears all stored associations.
+    /// </summary>
+    void Clear();
+}
@@ -0,0 +1,48 @@
+using AiDotNet.LinearAlgebra;
+
+namespace AiDotNet.Interfaces;
+
+/// <summary>
+/// Interface for Context Flow mechanism - maintains distinct information pathways
+/// and update rates for each nested optimization level.
+/// Core component of nested learning paradigm.
+/// </summary>
+/// <typeparam name="T">The numeric type</typeparam>
+public interface IContextFlow<T>
+{
+    /// <summary>
+    /// Propagates context through the flow network at a specific optimization level.
+    /// Each level has its own distinct set of information from which it learns.
+    /// </summary>
+    Vector<T> PropagateContext(Vector<T> input, int currentLevel);
+
+    /// <summary>
+    /// Computes gradients with respect to context flow for backpropagation.
+    /// </summary>
+    Vector<T> ComputeContextGradients(Vector<T> upstreamGradient, int level);
+
+    /// <summary>
+    /// Updates the context flow based on multi-level optimization.
+    /// </summary>
+    void UpdateFlow(Vector<T>[] gradients, T[] learningRates);
+
+    /// <summary>
+    /// Gets the current context state for a specific optimization level.
+    /// </summary>
+    Vector<T> GetContextState(int level);
+
+    /// <summary>
+    /// Compresses internal context flows (deep learning compression mechanism).
+    /// </summary>
+    Vector<T> CompressContext(Vector<T> context, int targetLevel);
+
+    /// <summary>
+    /// Resets the context flow to initial state.
+    /// </summary>
+    void Reset();
+
+    /// <summary>
+    /// Gets the number of context flow levels.
+    /// </summary>
+    int NumberOfLevels { get; }
+}
@@ -372,10 +372,10 @@ public void ApplyGradients(Vector<T> gradients, T learningRate)
     /// - Throws an error if the input has the wrong number of features
     /// 
     /// This is the core of how a linear model works - it's just a weighted sum:
-    /// prediction = (input1 � coefficient1) + (input2 � coefficient2) + ...
-    /// 
+    /// prediction = (input1 × coefficient1) + (input2 × coefficient2) + ...
+    ///
     /// For example, with coefficients [50000, 100, 20000] and input [3, 1500, 2],
-    /// the prediction would be: 3�50000 + 1500�100 + 2�20000 = 350,000
+    /// the prediction would be: 3×50000 + 1500×100 + 2×20000 = 350,000
     /// </para>
     /// </remarks>
     public T Evaluate(Vector<T> input)
 
@@ -0,0 +1,153 @@
+using AiDotNet.Helpers;
+using AiDotNet.Interfaces;
+using AiDotNet.LinearAlgebra;
+
+namespace AiDotNet.NestedLearning;
+
+/// <summary>
+/// Implementation of Associative Memory for nested learning.
+/// Models both backpropagation (data point → local error) and
+/// attention mechanisms (query → key-value) as associative memory.
+/// </summary>
+/// <typeparam name="T">The numeric type</typeparam>
+public class AssociativeMemory<T> : IAssociativeMemory<T>
+{
+    private readonly int _capacity;
+    private readonly int _dimension;
+    private readonly List<(Vector<T> Input, Vector<T> Target)> _memories;
+    private Matrix<T> _associationMatrix;
+    private static readonly INumericOperations<T> _numOps = MathHelper.GetNumericOperations<T>();
+
+    public AssociativeMemory(int dimension, int capacity = 1000)
+    {
+        _dimension = dimension;
+        _capacity = capacity;
+        _memories = new List<(Vector<T>, Vector<T>)>();
+        _associationMatrix = new Matrix<T>(dimension, dimension);
+    }
+
+    public void Associate(Vector<T> input, Vector<T> target)
+    {
+        if (input.Length != _dimension || target.Length != _dimension)
+            throw new ArgumentException("Input and target must match memory dimension");
+
+        // Add to memory buffer
+        _memories.Add((input.Clone(), target.Clone()));
+
+        // Maintain capacity limit (FIFO)
+        if (_memories.Count > _capacity)
+        {
+            _memories.RemoveAt(0);
+        }
+
+        // Update association matrix using Hebbian-like learning
+        UpdateAssociationMatrix(input, target, _numOps.FromDouble(0.01));
+    }
+
+    public Vector<T> Retrieve(Vector<T> query)
+    {
+        if (query.Length != _dimension)
+            throw new ArgumentException("Query must match memory dimension");
+
+        // Retrieve using association matrix (similar to attention mechanism)
+        var retrieved = _associationMatrix.Multiply(query);
+
+        // Also check for exact or near matches in memory buffer
+        T bestSimilarity = _numOps.FromDouble(double.NegativeInfinity);
+        Vector<T>? bestMatch = null;
+
+        foreach (var (input, target) in _memories)
+        {
+            T similarity = ComputeSimilarity(query, input);
+            if (_numOps.GreaterThan(similarity, bestSimilarity))
+            {
+                bestSimilarity = similarity;
+                bestMatch = target;
+            }
+        }
+
+        // Blend matrix-based retrieval with buffer-based retrieval
+        if (bestMatch != null && _numOps.GreaterThan(bestSimilarity, _numOps.FromDouble(0.8)))
+        {
+            T blendFactor = _numOps.FromDouble(0.3);
+            var blended = new Vector<T>(_dimension);
+
+            for (int i = 0; i < _dimension; i++)
+            {
+                T matrixPart = _numOps.Multiply(retrieved[i],
+                    _numOps.Subtract(_numOps.One, blendFactor));
+                T bufferPart = _numOps.Multiply(bestMatch[i], blendFactor);
+                blended[i] = _numOps.Add(matrixPart, bufferPart);
+            }
+
+            return blended;
+        }
+
+        return retrieved;
+    }
+
+    public void Update(Vector<T> input, Vector<T> target, T learningRate)
+    {
+        if (input.Length != _dimension || target.Length != _dimension)
+            throw new ArgumentException("Input and target must match memory dimension");
+
+        UpdateAssociationMatrix(input, target, learningRate);
+    }
+
+    private void UpdateAssociationMatrix(Vector<T> input, Vector<T> target, T learningRate)
+    {
+        // Hebbian learning rule: Δw_ij = η * target_i * input_j
+        // This models how backpropagation maps data points to local errors
+        for (int i = 0; i < _dimension; i++)
+        {
+            for (int j = 0; j < _dimension; j++)
+            {
+                T update = _numOps.Multiply(_numOps.Multiply(target[i], input[j]), learningRate);
+                _associationMatrix[i, j] = _numOps.Add(_associationMatrix[i, j], update);
+            }
+        }
+    }
+
+    private T ComputeSimilarity(Vector<T> a, Vector<T> b)
+    {
+        // Cosine similarity
+        T dotProduct = _numOps.Zero;
+        T normA = _numOps.Zero;
+        T normB = _numOps.Zero;
+
+        for (int i = 0; i < _dimension; i++)
+        {
+            dotProduct = _numOps.Add(dotProduct, _numOps.Multiply(a[i], b[i]));
+            normA = _numOps.Add(normA, _numOps.Square(a[i]));
+            normB = _numOps.Add(normB, _numOps.Square(b[i]));
+        }
+
+        normA = _numOps.Sqrt(normA);
+        normB = _numOps.Sqrt(normB);
+
+        T denominator = _numOps.Multiply(normA, normB);
+
+        if (_numOps.Equals(denominator, _numOps.Zero))
+            return _numOps.Zero;
+
+        return _numOps.Divide(dotProduct, denominator);
+    }
+
+    public int Capacity => _capacity;
+
+    public void Clear()
+    {
+        _memories.Clear();
+        _associationMatrix = new Matrix<T>(_dimension, _dimension);
+    }
+
+    /// <summary>
+    /// Gets the association matrix for inspection/debugging.
+    /// </summary>
+    public Matrix<T> GetAssociationMatrix() => _associationMatrix;
+
+    /// <summary>
+    /// Gets the number of stored memories.
+    /// </summary>
+    public int MemoryCount => _memories.Count;
+}