|
2 | 2 | import jax |
3 | 3 | import jax.numpy as jnp |
4 | 4 | from beartype.typing import Any, Hashable, Sequence |
| 5 | +from equinox import Module, field |
5 | 6 | from equinox.nn import State |
6 | 7 | from jaxtyping import Array, Float, PRNGKeyArray |
7 | 8 |
|
@@ -73,8 +74,8 @@ def __init__( |
73 | 74 | eps: float = 1e-5, |
74 | 75 | momentum: float = 0.1, |
75 | 76 | affine: bool = True, |
76 | | - inference: bool = False, |
77 | 77 | dtype: Any | None = None, |
| 78 | + inference: bool = False, |
78 | 79 | ): |
79 | 80 | if dtype is None: |
80 | 81 | dtype = default_floating_dtype() |
@@ -364,3 +365,100 @@ def __call__(self, x: Array, *_, key: PRNGKeyArray | None = None, **__) -> Array |
364 | 365 |
|
365 | 366 | out = out.astype(orig_dtype) |
366 | 367 | return out |
| 368 | + |
| 369 | + |
class ResidualLayerNorm(Module):
    """Layer normalisation with a residual scale parameter.

    Normalises the input by subtracting the mean and dividing by the standard
    deviation computed over the entire array. The learnable affine scale
    parameter is formulated as a residual ``1 + weight``, where ``weight`` is
    initialised to zero.

    Unlike ``LayerNorm``, this module expects the input to exactly match the
    configured ``shape`` and does not automatically broadcast over leading
    batch dimensions; use ``jax.vmap`` for batched inputs.

    Computation is performed at a higher precision (at least ``float32``) and
    the result is cast back to the original dtype.

    Args:
        shape: The exact shape of the unbatched input array. Pass a single
            ``int`` for the common 1-D case.
        eps: Small constant added to the variance for numerical stability.
            Defaults to ``1e-5``.
        use_weight: If ``True``, learn a per-element residual scale parameter
            initialised to ``0``. Defaults to ``True``.
        use_bias: If ``True``, learn a per-element bias parameter initialised
            to ``0``. Defaults to ``False``.
        dtype: Floating-point dtype for the affine parameters. Defaults to
            ``None``, in which case JAX's default floating-point dtype is used
            (the ``jnp.zeros`` default).

    Raises:
        ValueError: If the input shape does not exactly match ``shape``.

    Example:
        >>> import jax
        >>> import jax.numpy as jnp
        >>> rln = ResidualLayerNorm(shape=64)
        >>> x = jnp.ones((10, 64))
        >>> jax.vmap(rln)(x).shape
        (10, 64)
    """

    shape: tuple[int, ...] = field(static=True)
    eps: float = field(static=True)
    use_weight: bool = field(static=True)
    use_bias: bool = field(static=True)
    weight: Float[Array, "*shape"] | None
    bias: Float[Array, "*shape"] | None

    def __init__(
        self,
        shape: int | Sequence[int],
        eps: float = 1e-5,
        use_weight: bool = True,
        use_bias: bool = False,
        dtype: Any | None = None,
    ):
        # Normalise `shape` to a tuple so the exact-equality check against
        # `x.shape` in `__call__` works for both int and sequence inputs.
        if isinstance(shape, int):
            shape = (shape,)
        else:
            shape = tuple(shape)
        self.shape = shape
        self.eps = eps
        self.use_weight = use_weight
        self.use_bias = use_bias
        # `weight` is a *residual* scale: the effective scale applied in
        # `__call__` is `1 + weight`, so zero-initialisation means the layer
        # starts out as plain (unscaled) normalisation.
        self.weight = jnp.zeros(shape, dtype=dtype) if use_weight else None
        self.bias = jnp.zeros(shape, dtype=dtype) if use_bias else None

    def __call__(
        self,
        x: Float[Array, "*shape"],
        *,
        key: PRNGKeyArray | None = None,
    ) -> Array:
        """Normalise ``x`` and apply the residual affine transform.

        Args:
            x: Input array whose shape must exactly equal ``self.shape``.
            key: Ignored; accepted for call-signature compatibility with
                other equinox modules.

        Returns:
            The normalised array, cast back to ``x``'s original dtype.

        Raises:
            ValueError: If ``x.shape`` does not equal ``self.shape``.
        """
        if x.shape != self.shape:
            raise ValueError(
                f"Expected shape {self.shape}, got {x.shape}. You might need jax.vmap."
            )

        orig_dtype = x.dtype
        # Up-cast to at least float32 for the reduction arithmetic; the
        # result is cast back to the original dtype at the end.
        with jax.numpy_dtype_promotion("standard"):
            dtype = jnp.result_type(x.dtype, jnp.float32)

        x = x.astype(dtype)
        mean = jnp.mean(x, keepdims=True)
        variance = jnp.var(x, keepdims=True)
        # Clamp to zero: floating-point error can make the computed variance
        # very slightly negative, which would poison rsqrt.
        variance = jnp.maximum(0.0, variance)
        inv = jax.lax.rsqrt(variance + self.eps)
        out = (x - mean) * inv

        if self.use_weight:
            assert self.weight is not None
            # Residual formulation: effective scale is `1 + weight`.
            out = (1.0 + self.weight.astype(dtype)) * out
        if self.use_bias:
            assert self.bias is not None
            out = out + self.bias.astype(dtype)

        return out.astype(orig_dtype)
0 commit comments