Skip to content

Commit 6423760

Browse files
IgorTavcar and claude
committed
Add transformer components and improve autograd engine
- Simplify backward pass: replace _backward closures with _local_grads tuples (from karpathy#115)
- Zero grads before backward for idempotent backward() calls (from karpathy#102)
- Add exp, log, tanh, softmax to Value class
- Add transformer components: Linear, Embedding, LayerNorm, Attention, MultiHeadAttention, FeedForward, TransformerBlock, Transformer, cross_entropy
- Move single-output unwrapping from Layer to MLP (from karpathy#111)
- Add input shape assertion in Neuron (from karpathy#107)
- Add MLP test (from karpathy#111)
- Expand .gitignore with standard Python patterns

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c911406 commit 6423760

File tree

4 files changed

+305
-30
lines changed

4 files changed

+305
-30
lines changed

.gitignore

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,58 @@
11
.ipynb_checkpoints/
2+
3+
# Byte-compiled / optimized / DLL files
4+
__pycache__/
5+
*.py[codz]
6+
*$py.class
7+
8+
# Distribution / packaging
9+
.Python
10+
build/
11+
develop-eggs/
12+
dist/
13+
downloads/
14+
eggs/
15+
.eggs/
16+
lib/
17+
lib64/
18+
parts/
19+
sdist/
20+
var/
21+
wheels/
22+
share/python-wheels/
23+
*.egg-info/
24+
.installed.cfg
25+
*.egg
26+
MANIFEST
27+
28+
# Jupyter Notebook
29+
.ipynb_checkpoints
30+
31+
# IPython
32+
profile_default/
33+
ipython_config.py
34+
35+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
36+
__pypackages__/
37+
38+
# Environments
39+
.env
40+
.envrc
41+
.venv
42+
env/
43+
venv/
44+
ENV/
45+
env.bak/
46+
venv.bak/
47+
48+
# pytype static type analyzer
49+
.pytype/
50+
51+
# Cython debug symbols
52+
cython_debug/
53+
54+
# PyPI configuration file
55+
.pypirc
56+
57+
# VSCode
58+
.vscode/

micrograd/engine.py

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +1,55 @@
1+
import math
12

23
class Value:
34
""" stores a single scalar value and its gradient """
45

5-
def __init__(self, data, _children=(), _op=''):
6+
def __init__(self, data, _children=(), _op='', _local_grads=()):
67
self.data = data
78
self.grad = 0
89
# internal variables used for autograd graph construction
9-
self._backward = lambda: None
10-
self._prev = set(_children)
10+
self._prev = _children
1111
self._op = _op # the op that produced this node, for graphviz / debugging / etc
12+
self._local_grads = _local_grads # local derivative of this node w.r.t. its children
1213

1314
def __add__(self, other):
1415
other = other if isinstance(other, Value) else Value(other)
15-
out = Value(self.data + other.data, (self, other), '+')
16-
17-
def _backward():
18-
self.grad += out.grad
19-
other.grad += out.grad
20-
out._backward = _backward
21-
16+
out = Value(self.data + other.data, (self, other), '+', (1, 1))
2217
return out
2318

2419
def __mul__(self, other):
2520
other = other if isinstance(other, Value) else Value(other)
26-
out = Value(self.data * other.data, (self, other), '*')
27-
28-
def _backward():
29-
self.grad += other.data * out.grad
30-
other.grad += self.data * out.grad
31-
out._backward = _backward
32-
21+
out = Value(self.data * other.data, (self, other), '*', (other.data, self.data))
3322
return out
3423

3524
def __pow__(self, other):
3625
assert isinstance(other, (int, float)), "only supporting int/float powers for now"
37-
out = Value(self.data**other, (self,), f'**{other}')
38-
39-
def _backward():
40-
self.grad += (other * self.data**(other-1)) * out.grad
41-
out._backward = _backward
42-
26+
out = Value(self.data**other, (self,), f'**{other}', (other * self.data**(other-1),))
4327
return out
4428

4529
def relu(self):
46-
out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')
30+
out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU', (float(self.data > 0),))
31+
return out
4732

48-
def _backward():
49-
self.grad += (out.data > 0) * out.grad
50-
out._backward = _backward
33+
def exp(self):
    """e ** self. The local derivative of exp(x) is exp(x) itself,
    so the output value doubles as the stored local gradient."""
    y = math.exp(self.data)
    out = Value(y, (self,), 'exp', (y,))
    return out
37+
38+
def log(self):
    """Natural logarithm; local derivative is 1/x (requires self.data > 0)."""
    # Compute the log first so a non-positive input raises math's ValueError
    # before any division is attempted.
    logx = math.log(self.data)
    out = Value(logx, (self,), 'log', (1.0 / self.data,))
    return out
5141

42+
def tanh(self):
    """Hyperbolic tangent; d(tanh x)/dx = 1 - tanh(x)**2."""
    y = math.tanh(self.data)
    out = Value(y, (self,), 'tanh', (1 - y**2,))
    return out
5346

47+
@staticmethod
def softmax(logits):
    """Softmax over a list of Value logits, returned as a list of Values.

    Shifts every logit by the maximum logit before exponentiating so that
    math.exp cannot overflow for large inputs. The shift is a constant
    w.r.t. each logit, so the result and its gradients are unchanged.
    """
    if not logits:
        return []
    # Max over raw floats; subtracting a Python float goes through Value.__add__.
    m = max(logit.data for logit in logits)
    counts = [(logit + (-m)).exp() for logit in logits]
    total = sum(counts)
    return [count / total for count in counts]
52+
5453
def backward(self):
5554

5655
# topological order all of the children in the graph
@@ -64,10 +63,16 @@ def build_topo(v):
6463
topo.append(v)
6564
build_topo(self)
6665

66+
# zero the grads of each node prior to accumulating so that calling
67+
# L.backward() twice in a row doesn't produce the wrong answer.
68+
for v in reversed(topo):
69+
v.grad = 0.0
70+
6771
# go one variable at a time and apply the chain rule to get its gradient
6872
self.grad = 1
6973
for v in reversed(topo):
70-
v._backward()
74+
for child, local_grad in zip(v._prev, v._local_grads):
75+
child.grad += local_grad * v.grad
7176

7277
def __neg__(self): # -self
7378
return self * -1

micrograd/nn.py

Lines changed: 184 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def __init__(self, nin, nonlin=True):
1818
self.nonlin = nonlin
1919

2020
def __call__(self, x):
21+
assert len(x) == len(self.w), "Shape mismatch between input and given nin value"
2122
act = sum((wi*xi for wi,xi in zip(self.w, x)), self.b)
2223
return act.relu() if self.nonlin else act
2324

@@ -34,7 +35,7 @@ def __init__(self, nin, nout, **kwargs):
3435

3536
def __call__(self, x):
3637
out = [n(x) for n in self.neurons]
37-
return out[0] if len(out) == 1 else out
38+
return out
3839

3940
def parameters(self):
4041
return [p for n in self.neurons for p in n.parameters()]
@@ -51,10 +52,191 @@ def __init__(self, nin, nouts):
5152
def __call__(self, x):
5253
for layer in self.layers:
5354
x = layer(x)
54-
return x
55+
return x[0] if len(x) == 1 else x
5556

5657
def parameters(self):
5758
return [p for layer in self.layers for p in layer.parameters()]
5859

5960
def __repr__(self):
6061
return f"MLP of [{', '.join(str(layer) for layer in self.layers)}]"
62+
63+
# --- Transformer components ---
64+
65+
class Linear(Module):
    """Linear projection (no nonlinearity), with optional bias.

    Weights are initialized uniformly in [-1/sqrt(nin), 1/sqrt(nin)];
    the bias (when enabled) starts at zero.
    """

    def __init__(self, nin, nout, bias=True):
        scale = nin ** -0.5
        self.w = [[Value(random.uniform(-scale, scale)) for _ in range(nin)] for _ in range(nout)]
        self.b = [Value(0.0) for _ in range(nout)] if bias else None

    def __call__(self, x):
        # One dot product per output row: out[i] = w[i] . x (+ b[i]).
        out = [sum(wi * xi for wi, xi in zip(row, x)) for row in self.w]
        # Test identity against None rather than truthiness: an enabled bias
        # must be applied even in the degenerate nout == 0 case where the
        # bias list is empty (and therefore falsy).
        if self.b is not None:
            out = [oi + bi for oi, bi in zip(out, self.b)]
        return out

    def parameters(self):
        params = [v for row in self.w for v in row]
        if self.b is not None:
            params += self.b
        return params

    def __repr__(self):
        nout, nin = len(self.w), len(self.w[0])
        return f"Linear({nin}, {nout}, bias={self.b is not None})"
88+
89+
class Embedding(Module):
    """Lookup table that maps integer indices to dense vectors."""

    def __init__(self, num_embeddings, embedding_dim):
        # num_embeddings x embedding_dim table, small gaussian init.
        self.weight = []
        for _ in range(num_embeddings):
            self.weight.append([Value(random.gauss(0, 0.02)) for _ in range(embedding_dim)])

    def __call__(self, idx):
        # Plain row lookup; gradients flow into the selected row only.
        return self.weight[idx]

    def parameters(self):
        flat = []
        for row in self.weight:
            flat.extend(row)
        return flat

    def __repr__(self):
        return f"Embedding({len(self.weight)}, {len(self.weight[0])})"
104+
105+
class LayerNorm(Module):
    """Layer normalization over the last dimension.

    Normalizes a vector of Values to zero mean / unit variance and applies
    a learnable per-feature scale (gamma) and shift (beta).
    """

    def __init__(self, dim, eps=1e-5):
        self.gamma = [Value(1.0) for _ in range(dim)]
        self.beta = [Value(0.0) for _ in range(dim)]
        self.eps = eps  # numerical-stability floor added to the variance

    def __call__(self, x):
        n = len(x)
        mean = sum(x) * (1.0 / n)
        var = sum((xi - mean) ** 2 for xi in x) * (1.0 / n)
        # Hoist the inverse std out of the per-feature comprehension: the
        # previous version rebuilt the identical (var + eps) ** -0.5 subgraph
        # once per feature. Sharing one node yields the same values and the
        # same accumulated gradients with a smaller autograd graph.
        inv_std = (var + self.eps) ** -0.5
        return [(xi - mean) * inv_std * g + b
                for xi, g, b in zip(x, self.gamma, self.beta)]

    def parameters(self):
        return self.gamma + self.beta

    def __repr__(self):
        return f"LayerNorm({len(self.gamma)})"
124+
125+
class Attention(Module):
    """Single-head scaled dot-product attention."""

    def __init__(self, dim, head_dim):
        self.query = Linear(dim, head_dim, bias=False)
        self.key = Linear(dim, head_dim, bias=False)
        self.value = Linear(dim, head_dim, bias=False)
        self.head_dim = head_dim

    def __call__(self, x, mask=False):
        # x: list of seq_len vectors, each a list of `dim` Values.
        n = len(x)
        Q = [self.query(t) for t in x]
        K = [self.key(t) for t in x]
        V = [self.value(t) for t in x]
        scale = self.head_dim ** 0.5
        result = []
        for i in range(n):
            scores = []
            for j in range(n):
                if mask and j > i:
                    scores.append(Value(-1e9))  # causal mask
                else:
                    dot = sum(q * k for q, k in zip(Q[i], K[j]))
                    scores.append(dot * (1.0 / scale))
            weights = Value.softmax(scores)
            # Weighted sum of value vectors, one component at a time.
            row = []
            for d in range(self.head_dim):
                row.append(sum(w * V[j][d] for j, w in enumerate(weights)))
            result.append(row)
        return result

    def parameters(self):
        return self.query.parameters() + self.key.parameters() + self.value.parameters()
155+
156+
class MultiHeadAttention(Module):
    """Multi-head attention with output projection."""

    def __init__(self, dim, num_heads):
        assert dim % num_heads == 0
        head_dim = dim // num_heads
        self.heads = [Attention(dim, head_dim) for _ in range(num_heads)]
        self.proj = Linear(dim, dim)

    def __call__(self, x, mask=False):
        per_head = [h(x, mask) for h in self.heads]
        projected = []
        for i in range(len(x)):
            # Concatenate this position's vectors from every head, then project.
            merged = []
            for head_out in per_head:
                merged.extend(head_out[i])
            projected.append(self.proj(merged))
        return projected

    def parameters(self):
        collected = []
        for head in self.heads:
            collected += head.parameters()
        return collected + self.proj.parameters()
174+
175+
class FeedForward(Module):
    """Two-layer feed-forward network with ReLU."""

    def __init__(self, dim, hidden_dim=None):
        # Truthiness test mirrors the `hidden_dim or 4 * dim` default:
        # None (and 0) both fall back to the conventional 4x expansion.
        if not hidden_dim:
            hidden_dim = 4 * dim
        self.up = Linear(dim, hidden_dim)
        self.down = Linear(hidden_dim, dim)

    def __call__(self, x):
        hidden = self.up(x)
        activated = [h.relu() for h in hidden]
        return self.down(activated)

    def parameters(self):
        return self.up.parameters() + self.down.parameters()
188+
189+
class TransformerBlock(Module):
    """Pre-norm transformer block: LN -> Attention -> Residual -> LN -> FFN -> Residual."""

    def __init__(self, dim, num_heads):
        self.ln1 = LayerNorm(dim)
        self.attn = MultiHeadAttention(dim, num_heads)
        self.ln2 = LayerNorm(dim)
        self.ff = FeedForward(dim)

    def __call__(self, x, mask=False):
        # Attention sublayer with residual connection.
        normed = [self.ln1(xi) for xi in x]
        attn_out = self.attn(normed, mask)
        x = [[u + v for u, v in zip(xi, ai)] for xi, ai in zip(x, attn_out)]
        # Feed-forward sublayer with residual connection.
        ff_out = [self.ff(self.ln2(xi)) for xi in x]
        x = [[u + v for u, v in zip(xi, fi)] for xi, fi in zip(x, ff_out)]
        return x

    def parameters(self):
        return (self.ln1.parameters() + self.attn.parameters()
                + self.ln2.parameters() + self.ff.parameters())
210+
211+
class Transformer(Module):
    """Decoder-only transformer (GPT-style)."""

    def __init__(self, vocab_size, dim, num_heads, num_layers, max_seq_len):
        self.token_emb = Embedding(vocab_size, dim)
        self.pos_emb = Embedding(max_seq_len, dim)
        self.blocks = [TransformerBlock(dim, num_heads) for _ in range(num_layers)]
        self.ln_f = LayerNorm(dim)
        self.output = Linear(dim, vocab_size, bias=False)

    def __call__(self, tokens):
        # tokens: list of integer token ids
        x = []
        for i, tok in enumerate(tokens):
            tok_vec = self.token_emb(tok)
            pos_vec = self.pos_emb(i)
            x.append([t + p for t, p in zip(tok_vec, pos_vec)])
        # Causal mask is always on: this is a decoder-only model.
        for block in self.blocks:
            x = block(x, mask=True)
        return [self.output(self.ln_f(xi)) for xi in x]

    def parameters(self):
        params = self.token_emb.parameters() + self.pos_emb.parameters()
        for block in self.blocks:
            params += block.parameters()
        return params + self.ln_f.parameters() + self.output.parameters()

    def __repr__(self):
        return f"Transformer({len(self.parameters())} parameters)"
238+
239+
def cross_entropy(logits, target):
    """Cross-entropy loss. logits: list of Values, target: integer index."""
    probs = Value.softmax(logits)
    target_logprob = probs[target].log()
    return -target_logprob

0 commit comments

Comments
 (0)