Commit f5ae79b

Author: Juan Orduz
Flax NNX integration (#1990)
* This is a combination of 9 commits: init, fix, fix, fix, lint, rm files, rm files, rm files, rm files, rm files
* improvement
* improvements
* add more tests
* add mcmc cases
* Refactor: add example, rm test, patch, feedback 1, feedback 2, feedback 3, improvements, fix test, hacky way to find batch normalization layers, remove unused code, simplify code, modularize tests, simplify, simplify
* support python 3.9
* init example
* fix model
* cleanup nb
* artifacts docs
* remove code: eager initialization approach, rm unused code 1, rm unused code 2, rm code, rm code, rm code, rm code
* rm code
* refactor
* skip tests
* feedback 1/n
* feedback 2/n
* lint
* skip test
* feedback 3/n
* partial
* feedback states
* clean tests part 1
* set priors
* better split
* rm mutable from signature
* fix numpyro_mutable
* fix test
* feedback part 1
* try fix apply function
* Revert "try fix apply function" (reverts commit 17db8c1)
* Revert "Revert "try fix apply function"" (reverts commit 9a870d6)
* rm redundant line
* better test name
* are we updating more than needed?
* rename and bring back update after model call
* add prior component to example
* finalize docs
* to_dict
* stop gradient
* final comments
1 parent 374cd89 commit f5ae79b

File tree

7 files changed: +1008 -4 lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -35,4 +35,4 @@ repos:
       - id: codespell
         stages: [pre-commit, commit-msg]
         args:
-          [--ignore-words-list, "Teh,aas,ans", --check-filenames, --skip, "*.ipynb"]
+          [--ignore-words-list, "Teh,aas,ans,dout", --check-filenames, --skip, "*.ipynb"]
(291 KB binary file added; diff not rendered)

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ NumPyro documentation
     tutorials/censoring
     tutorials/hsgp_example
     tutorials/other_samplers
+    tutorials/nnx_example

 .. nbgallery::
     :maxdepth: 1

docs/source/primitives.rst

Lines changed: 8 additions & 0 deletions
@@ -51,6 +51,10 @@ haiku_module
 ------------
 .. autofunction:: numpyro.contrib.module.haiku_module

+nnx_module
+----------
+.. autofunction:: numpyro.contrib.module.nnx_module
+
 random_flax_module
 ------------------
 .. autofunction:: numpyro.contrib.module.random_flax_module
@@ -59,6 +63,10 @@ random_haiku_module
 -------------------
 .. autofunction:: numpyro.contrib.module.random_haiku_module

+random_nnx_module
+-----------------
+.. autofunction:: numpyro.contrib.module.random_nnx_module
+
 scan
 ----
 .. autofunction:: numpyro.contrib.control_flow.scan

notebooks/source/nnx_example.ipynb

Lines changed: 699 additions & 0 deletions
Large diffs are not rendered by default.

numpyro/contrib/module.py

Lines changed: 133 additions & 0 deletions
@@ -19,6 +19,8 @@
     "haiku_module",
     "random_flax_module",
     "random_haiku_module",
+    "nnx_module",
+    "random_nnx_module",
 ]


@@ -433,3 +435,134 @@ def random_haiku_module(
     _update_params(params, new_params, prior)
     nn_new = partial(nn.func, new_params, *nn.args[1:], **nn.keywords)
     return nn_new
+
+
+def nnx_module(name, nn_module):
+    """
+    Declare a :mod:`~flax.nnx` style neural network inside a
+    model so that its parameters are registered for optimization via
+    :func:`~numpyro.primitives.param` statements.
+
+    Given a Flax NNX ``nn_module``, we evaluate the module by calling it
+    directly. In a NumPyro model, the pattern will be::
+
+        # Eager initialization outside the model
+        module = nn_module(...)
+
+        # Inside the model
+        net = nnx_module("net", module)
+        y = net(x)
+
+    :param str name: name of the module to be registered.
+    :param flax.nnx.Module nn_module: a pre-initialized `flax nnx` Module instance.
+    :return: a callable that takes an array as an input and returns
+        the neural network transformed output array.
+    """
+    try:
+        from flax import nnx
+    except ImportError as e:
+        raise ImportError(
+            "It looks like you want to use flax.nnx to declare "
+            "nn modules. This is an experimental feature. "
+            "You need to install the latest version of `flax` to use it. "
+            "It can be installed with `pip install git+https://github.com/google/flax.git`."
+        ) from e
+
+    graph_def, eager_params_state, eager_other_state = nnx.split(
+        nn_module, nnx.Param, nnx.Not(nnx.Param)
+    )
+
+    eager_params_state_dict = nnx.to_pure_dict(eager_params_state)
+
+    module_params = None
+    if eager_params_state:
+        module_params = numpyro.param(name + "$params")
+        if module_params is None:
+            module_params = numpyro.param(name + "$params", eager_params_state_dict)
+
+    eager_other_state_dict = nnx.to_pure_dict(eager_other_state)
+
+    mutable_holder = None
+    if eager_other_state_dict:
+        mutable_holder = numpyro_mutable(name + "$state")
+        if mutable_holder is None:
+            mutable_holder = numpyro_mutable(
+                name + "$state", {"state": eager_other_state_dict}
+            )
+
+    def apply_fn(params, *call_args, **call_kwargs):
+        params_state = eager_params_state
+        if params:
+            nnx.replace_by_pure_dict(params_state, params)
+
+        mutable_state = eager_other_state
+        if mutable_holder:
+            nnx.replace_by_pure_dict(mutable_state, mutable_holder["state"])
+
+        model = nnx.merge(graph_def, params_state, mutable_state)
+
+        model_call = model(*call_args, **call_kwargs)
+
+        if mutable_holder:
+            _, _, new_mutable_state = nnx.split(model, nnx.Param, nnx.Not(nnx.Param))
+            new_mutable_state = jax.lax.stop_gradient(new_mutable_state)
+            mutable_holder["state"] = nnx.to_pure_dict(new_mutable_state)
+
+        return model_call
+
+    return partial(apply_fn, module_params)
+
+
+def random_nnx_module(
+    name,
+    nn_module,
+    prior,
+):
+    """
+    A primitive to create a random :mod:`~flax.nnx` style neural network
+    which can be used in MCMC samplers. The parameters of the neural network
+    will be sampled from ``prior``.
+
+    :param str name: name of the module to be registered.
+    :param flax.nnx.Module nn_module: a pre-initialized `flax nnx` Module instance.
+    :param prior: a distribution, a dict of distributions, or a callable.
+        If it is a distribution, all parameters will be sampled from the same
+        distribution. If it is a dict, it maps parameter names to distributions.
+        If it is a callable, it takes a parameter name and a parameter shape as
+        inputs and returns a distribution. For example::
+
+            class Linear(nnx.Module):
+                def __init__(self, din, dout, *, rngs):
+                    self.w = nnx.Param(jax.random.uniform(rngs.params(), (din, dout)))
+                    self.b = nnx.Param(jnp.zeros((dout,)))
+
+                def __call__(self, x):
+                    return x @ self.w + self.b
+
+            # Eager initialization
+            linear = Linear(din=4, dout=1, rngs=nnx.Rngs(params=random.PRNGKey(0)))
+            net = random_nnx_module("net", linear, prior={"w": dist.Normal(), "b": dist.Cauchy()})
+
+        Alternatively, we can use a callable. For example, the following are equivalent::
+
+            prior=(lambda name, shape: dist.Cauchy() if name.endswith("b") else dist.Normal())
+            prior={"w": dist.Normal(), "b": dist.Cauchy()}
+
+    :return: a callable that takes an array as an input and returns
+        the neural network transformed output array.
+    """
+
+    nn = nnx_module(name, nn_module)
+
+    apply_fn = nn.func
+    params = nn.args[0]
+    other_args = nn.args[1:]
+    keywords = nn.keywords
+
+    new_params = deepcopy(params)
+
+    with numpyro.handlers.scope(prefix=name):
+        _update_params(params, new_params, prior)
+
+    return partial(apply_fn, new_params, *other_args, **keywords)
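
Editor's note: for orientation, here is a minimal end-to-end sketch of how the new nnx_module primitive is meant to be used with SVI. It is not part of this commit; the MLP class, the sizes, the synthetic data, and the AutoDelta guide are illustrative assumptions (the guide choice mirrors the tests below).

import jax
import jax.numpy as jnp
from flax import nnx

import numpyro
import numpyro.distributions as dist
from numpyro.contrib.module import nnx_module
from numpyro.infer import SVI, Trace_ELBO
from numpyro.infer.autoguide import AutoDelta

# Hypothetical single-layer module for illustration; any pre-initialized
# nnx.Module works here.
class MLP(nnx.Module):
    def __init__(self, din, dout, *, rngs):
        self.linear = nnx.Linear(din, dout, rngs=rngs)

    def __call__(self, x):
        return self.linear(x)

# Eager initialization outside the model, as the docstring prescribes.
module = MLP(din=3, dout=1, rngs=nnx.Rngs(params=jax.random.PRNGKey(0)))

def model(x, y=None):
    net = nnx_module("net", module)  # registers weights via numpyro.param
    sigma = numpyro.sample("sigma", dist.HalfNormal(1.0))
    mean = net(x).squeeze(-1)
    numpyro.sample("y", dist.Normal(mean, sigma), obs=y)

x = jax.random.normal(jax.random.PRNGKey(1), (32, 3))
y = x @ jnp.array([1.0, 2.0, 3.0])
svi = SVI(model, AutoDelta(model), numpyro.optim.Adam(0.01), Trace_ELBO())
svi_result = svi.run(jax.random.PRNGKey(2), 100, x, y)  # params live under "net$params"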
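Similarly, a sketch of the fully Bayesian counterpart, random_nnx_module with NUTS. The Linear class is the one from the docstring above; the synthetic logistic-regression data and chain lengths are assumptions kept small for illustration.

import jax
import jax.numpy as jnp
from flax import nnx

import numpyro
import numpyro.distributions as dist
from numpyro.contrib.module import random_nnx_module
from numpyro.infer import MCMC, NUTS

class Linear(nnx.Module):
    def __init__(self, din, dout, *, rngs):
        self.w = nnx.Param(jax.random.uniform(rngs.params(), (din, dout)))
        self.b = nnx.Param(jnp.zeros((dout,)))

    def __call__(self, x):
        return x @ self.w + self.b

# Eager initialization, then priors over the registered parameters.
linear = Linear(din=3, dout=1, rngs=nnx.Rngs(params=jax.random.PRNGKey(0)))

def model(x, y=None):
    # Weights get Normal priors, biases get Cauchy priors.
    net = random_nnx_module(
        "net", linear, prior={"w": dist.Normal(), "b": dist.Cauchy()}
    )
    logits = net(x).squeeze(-1)
    numpyro.sample("y", dist.Bernoulli(logits=logits), obs=y)

x = jax.random.normal(jax.random.PRNGKey(1), (100, 3))
y = dist.Bernoulli(logits=x.sum(-1)).sample(jax.random.PRNGKey(2))
mcmc = MCMC(NUTS(model), num_warmup=100, num_samples=100, progress_bar=False)
mcmc.run(jax.random.PRNGKey(3), x, y)
samples = mcmc.get_samples()  # posterior draws under "net/w" and "net/b"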

test/contrib/test_module.py

Lines changed: 166 additions & 3 deletions
@@ -2,13 +2,15 @@
 # SPDX-License-Identifier: Apache-2.0

 from copy import deepcopy
+import sys

 import numpy as np
 from numpy.testing import assert_allclose
 import pytest

 import jax
 from jax import random
+import jax.numpy as jnp

 import numpyro
 from numpyro import handlers
@@ -17,8 +19,10 @@
     _update_params,
     flax_module,
     haiku_module,
+    nnx_module,
     random_flax_module,
     random_haiku_module,
+    random_nnx_module,
 )
 import numpyro.distributions as dist
 from numpyro.infer import MCMC, NUTS, SVI, Trace_ELBO
@@ -195,9 +199,9 @@ def test_random_module_mcmc(backend, init, callable_prior):
     kwargs = {}

     if callable_prior:
-        prior = (  # noqa: E731
-            lambda name, shape: dist.Cauchy() if name == bias_name else dist.Normal()
-        )
+
+        def prior(name, shape):
+            return dist.Cauchy() if name == bias_name else dist.Normal()
     else:
         prior = {bias_name: dist.Cauchy(), weight_name: dist.Normal()}

@@ -311,3 +315,162 @@ def model():
     guide = AutoDelta(model)
     svi = SVI(model, guide, numpyro.optim.Adam(0.01), Trace_ELBO())
     svi.run(random.PRNGKey(100), 10)
+
+
+@pytest.mark.skipif(sys.version_info[:2] == (3, 9), reason="Skipping on Python 3.9")
+def test_nnx_module():
+    from flax import nnx
+
+    X = np.arange(100).astype(np.float32)
+    Y = 2 * X + 2
+
+    class Linear(nnx.Module):
+        def __init__(self, din, dout, *, rngs):
+            self.w = nnx.Param(jax.random.uniform(rngs.params(), (din, dout)))
+            self.bias = nnx.Param(jnp.zeros((dout,)))
+
+        def __call__(self, x):
+            w_val = self.w.value
+            bias_val = self.bias.value
+            return x @ w_val + bias_val
+
+    # Eager initialization of the Linear module outside the model
+    rng_key = random.PRNGKey(1)
+    linear_module = Linear(din=100, dout=100, rngs=nnx.Rngs(params=rng_key))
+
+    # Extract parameters and state for inspection
+    _, params_state = nnx.split(linear_module, nnx.Param)
+    params_dict = nnx.to_pure_dict(params_state)
+
+    # Verify parameters were created correctly
+    assert "w" in params_dict
+    assert "bias" in params_dict
+    assert params_dict["w"].shape == (100, 100)
+    assert params_dict["bias"].shape == (100,)
+
+    # Define a model using eager initialization
+    def nnx_model_eager(x, y):
+        # Use the pre-initialized Linear module
+        nn = nnx_module("nn", linear_module)
+        mean = nn(x)
+        numpyro.sample("y", numpyro.distributions.Normal(mean, 0.1), obs=y)
+
+    with handlers.trace() as nnx_tr, handlers.seed(rng_seed=1):
+        nnx_model_eager(X, Y)
+
+    assert "w" in nnx_tr["nn$params"]["value"]
+    assert "bias" in nnx_tr["nn$params"]["value"]
+    assert nnx_tr["nn$params"]["value"]["w"].shape == (100, 100)
+    assert nnx_tr["nn$params"]["value"]["bias"].shape == (100,)
+
+
+@pytest.mark.skipif(sys.version_info[:2] == (3, 9), reason="Skipping on Python 3.9")
+@pytest.mark.parametrize(
+    argnames="dropout", argvalues=[True, False], ids=["dropout", "no_dropout"]
+)
+@pytest.mark.parametrize(
+    argnames="batchnorm", argvalues=[True, False], ids=["batchnorm", "no_batchnorm"]
+)
+def test_nnx_state_dropout_smoke(dropout, batchnorm):
+    from flax import nnx
+
+    class Net(nnx.Module):
+        def __init__(self, *, rngs):
+            if batchnorm:
+                # Use feature dimension 3 to match the input shape (4, 3)
+                self.bn = nnx.BatchNorm(3, rngs=rngs)
+            if dropout:
+                # Create dropout with deterministic=True to disable dropout
+                self.dropout = nnx.Dropout(rate=0.5, deterministic=True, rngs=rngs)
+
+        def __call__(self, x, *, rngs=None):
+            if dropout:
+                # Use deterministic=True to disable dropout
+                x = self.dropout(x, deterministic=True)
+
+            if batchnorm:
+                x = self.bn(x)
+
+            return x
+
+    # Eager initialization of the Net module outside the model
+    rng_key = random.PRNGKey(0)
+    net_module = Net(rngs=nnx.Rngs(params=rng_key))
+
+    # Extract parameters and state for inspection
+    _, state = nnx.split(net_module)
+
+    def model():
+        # Use the pre-initialized module
+        nn = nnx_module("nn", net_module)
+
+        x = numpyro.sample("x", dist.Normal(0, 1).expand([4, 3]).to_event(2))
+        y = nn(x)
+        numpyro.deterministic("y", y)
+
+    with handlers.trace(model) as tr, handlers.seed(rng_seed=0):
+        model()
+
+    assert set(tr.keys()) == {"nn$params", "nn$state", "x", "y"}
+    assert tr["nn$state"]["type"] == "mutable"
+
+    # test svi
+    guide = AutoDelta(model)
+    svi = SVI(model, guide, numpyro.optim.Adam(0.01), Trace_ELBO())
+    svi.run(random.PRNGKey(100), 10)
+
+
+@pytest.mark.skipif(sys.version_info[:2] == (3, 9), reason="Skipping on Python 3.9")
+@pytest.mark.parametrize("callable_prior", [True, False])
+def test_random_nnx_module_mcmc(callable_prior):
+    from flax import nnx
+
+    class Linear(nnx.Module):
+        def __init__(self, din, dout, *, rngs):
+            self.w = nnx.Param(jax.random.uniform(rngs.params(), (din, dout)))
+            self.b = nnx.Param(jnp.zeros((dout,)))
+
+        def __call__(self, x):
+            w_val = self.w
+            b_val = self.b
+            return x @ w_val + b_val
+
+    N, dim = 3000, 3
+    data = random.normal(random.PRNGKey(0), (N, dim))
+    true_coefs = np.arange(1.0, dim + 1.0)
+    logits = np.sum(true_coefs * data, axis=-1)
+    labels = dist.Bernoulli(logits=logits).sample(random.PRNGKey(1))
+
+    if callable_prior:
+
+        def prior(name, shape):
+            return dist.Cauchy() if name == "b" else dist.Normal()
+    else:
+        prior = {"w": dist.Normal(), "b": dist.Cauchy()}
+
+    # Create a pre-initialized module for eager initialization
+    rng_key = random.PRNGKey(0)
+    linear_module = Linear(din=dim, dout=1, rngs=nnx.Rngs(params=rng_key))
+
+    # Extract parameters and state for inspection
+    _, params_state = nnx.split(linear_module, nnx.Param)
+    params_dict = nnx.to_pure_dict(params_state)
+
+    # Verify parameters were created correctly
+    assert "w" in params_dict
+    assert "b" in params_dict
+    assert params_dict["w"].shape == (dim, 1)
+    assert params_dict["b"].shape == (1,)
+
+    def model(data, labels=None):
+        # Use the pre-initialized module with eager initialization
+        nn = random_nnx_module("nn", linear_module, prior)
+        logits = nn(data).squeeze(-1)
+        return numpyro.sample("obs", dist.Bernoulli(logits=logits), obs=labels)
+
+    nuts_kernel = NUTS(model)
+    mcmc = MCMC(nuts_kernel, num_warmup=2, num_samples=2, progress_bar=False)
+    mcmc.run(random.PRNGKey(0), data, labels)
+    samples = mcmc.get_samples()
+    assert "nn/b" in samples
+    assert "nn/w" in samples
