77from noxton .nn import ResidualLayerNorm
88
99
def parallel_stabilized_simple(
    queries: "Float[Array, 'num_heads seq_len head_dim']",
    keys: "Float[Array, 'num_heads seq_len head_dim']",
    values: "Float[Array, 'num_heads seq_len head_dim']",
    igate_preact: "Float[Array, 'num_heads seq_len 1']",
    fgate_preact: "Float[Array, 'num_heads seq_len 1']",
    lower_triangular_matrix: "Float[Array, 'seq_len seq_len'] | None" = None,
    stabilize_rowwise: bool = True,
    eps: float = 1e-6,
    **kwargs,
) -> "Array":
    """Parallel (quadratic-time) mLSTM forward pass with log-space stabilization.

    Builds the gate-decay matrix D in log space, stabilizes it by subtracting
    its maximum before exponentiation, and combines it with the scaled
    query-key scores to mix the values causally.

    Args:
        queries: per-head query projections, shape (NH, S, DH).
        keys: per-head key projections, shape (NH, S, DH).
        values: per-head value projections, shape (NH, S, DH).
        igate_preact: input-gate pre-activations, shape (NH, S, 1).
        fgate_preact: forget-gate pre-activations, shape (NH, S, 1).
        lower_triangular_matrix: optional boolean causal mask of shape (T, T).
            If it is missing or T < S a fresh (S, S) mask is built; if T > S
            (e.g. a mask precomputed for max_seq_len) it is cropped to (S, S)
            so the masking broadcast below stays shape-correct.
        stabilize_rowwise: stabilize per row of D (True) or with a single
            per-head global maximum (False).
        eps: guard added to the normalizer to avoid division by zero.
        **kwargs: ignored; accepted for interface compatibility.

    Returns:
        Hidden pre-output states, shape (NH, S, DH).
    """
    NH, S, DH = queries.shape

    # Forget gates in log space: log(sigmoid(f)) is numerically safe.
    log_fgates = jax.nn.log_sigmoid(fgate_preact)

    # Build the causal mask when absent or too small; crop an oversized one
    # (the original used it as-is, which breaks the jnp.where broadcast
    # whenever the precomputed mask is larger than the current seq_len).
    if lower_triangular_matrix is None or lower_triangular_matrix.shape[0] < S:
        ltr = jnp.tril(jnp.ones(shape=(S, S), dtype=jnp.bool_))
    else:
        ltr = lower_triangular_matrix[:S, :S]

    # Cumulative log forget gates with a zero prefix, so position i holds
    # sum(log f_1 .. f_i); shape (NH, S+1, 1).
    log_fgates_cumsum = jnp.concatenate(
        (jnp.zeros((NH, 1, 1)), jnp.cumsum(log_fgates, axis=1)), axis=1
    )
    rep_log_fgates_cumsum = jnp.tile(log_fgates_cumsum, (1, 1, S + 1))

    # Entry (i, j) = sum(log f_{j+1} .. f_i): accumulated decay from j to i.
    _log_fg_matrix = rep_log_fgates_cumsum - rep_log_fgates_cumsum.transpose(0, 2, 1)
    # Keep only the causal (lower-triangular) part; future positions get
    # -inf so they vanish under exp().
    log_fg_matrix = jnp.where(ltr, _log_fg_matrix[:, 1:, 1:], -float("inf"))

    # D[i, j] = decay(i, j) + input-gate pre-activation at source step j.
    log_D_matrix = log_fg_matrix + igate_preact.transpose(0, 2, 1)

    # Stabilize before exponentiation by subtracting the maximum.
    if stabilize_rowwise:
        max_log_D = jnp.max(log_D_matrix, axis=-1, keepdims=True)
    else:
        max_log_D = jnp.expand_dims(
            jnp.max(log_D_matrix.reshape(NH, -1), axis=-1, keepdims=True), axis=-1
        )
    D_matrix = jnp.exp(log_D_matrix - max_log_D)

    keys_scaled = keys / jnp.sqrt(DH)

    # Gated attention-like combination, normalized row-wise; the max with
    # exp(-max_log_D) keeps the normalizer bounded away from zero.
    qk_matrix = queries @ keys_scaled.transpose(0, 2, 1)
    C_matrix = qk_matrix * D_matrix
    normalizer = jnp.maximum(
        jnp.abs(C_matrix.sum(axis=-1, keepdims=True)), jnp.exp(-max_log_D)
    )
    C_matrix_normalized = C_matrix / (normalizer + eps)
    return C_matrix_normalized @ values
61+
62+
1063class mLSTMCell (eqx .Module ):
64+ max_seq_len : int
1165 embedding_dim : int
1266 num_heads : int
1367
@@ -20,11 +74,13 @@ def __init__(
2074 self ,
2175 embedding_dim : int ,
2276 num_heads : int ,
77+ max_seq_len : int ,
2378 key : PRNGKeyArray ,
2479 dtype : Any | None = None ,
2580 ) -> None :
2681 self .embedding_dim = embedding_dim
2782 self .num_heads = num_heads
83+ self .max_seq_len = max_seq_len
2884 key , ikey , fkey = jax .random .split (key , 3 )
2985
3086 igate = eqx .nn .Linear (3 * embedding_dim , num_heads , key = ikey , dtype = dtype )
@@ -63,14 +119,22 @@ def __call__(
63119 k = jnp .reshape (k , shape = (seq_len , self .num_heads , head_dim )).transpose (1 , 0 , 2 )
64120 v = jnp .reshape (v , shape = (seq_len , self .num_heads , head_dim )).transpose (1 , 0 , 2 )
65121
66- igate_preact = self .igate (if_gate_input )
122+ igate_preact = eqx . filter_vmap ( self .igate ) (if_gate_input )
67123 igate_preact = jnp .expand_dims (igate_preact .T , axis = - 1 )
68124
69- fgate_preact = self .fgate (if_gate_input )
125+ fgate_preact = eqx . filter_vmap ( self .fgate ) (if_gate_input )
70126 fgate_preact = jnp .expand_dims (fgate_preact .T , axis = - 1 )
71127
72- print (f"{ igate_preact .shape = } " )
73- print (f"{ fgate_preact .shape = } " )
128+ ltr = jnp .tril (
129+ jnp .ones (shape = (self .max_seq_len , self .max_seq_len ), dtype = jnp .bool )
130+ )
131+
132+ h_state = parallel_stabilized_simple (
133+ q , k , v , igate_preact , fgate_preact , lower_triangular_matrix = ltr
134+ )
135+ h_state = h_state .transpose (1 , 0 , 2 ).reshape (seq_len , - 1 )
136+ h_state_norm = eqx .filter_vmap (self .outnorm )(h_state )
137+ return h_state_norm
74138
75139
76140class mLSTMLayer (eqx .Module ):
0 commit comments