Commit bdb3329

Pass value arg to optax, allowing use of reduce_on_plateau (#1974)
* Pass value arg to optax, allowing use of reduce_on_plateau
* Address some PR comments
* Simplify, improve typing
* Address more PR comments
* Updates from PR comments
* Special-case the reduce on plateau scheduler for JIT test
* Pass loss value in SteinVI
1 parent f5ae79b commit bdb3329

3 files changed: +96 -39 lines changed
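For context beyond the diff, a minimal end-to-end sketch of what this change enables is shown below. The toy model, guide, and data are illustrative placeholders, not part of the commit; the optimizer chain mirrors the one added to the test suite, and it works because `eval_and_update` now forwards the loss value that `reduce_on_plateau` needs.

import jax.numpy as jnp
import optax
import optax.contrib
from jax import random

import numpyro
import numpyro.distributions as dist
from numpyro import optim
from numpyro.infer import SVI, Trace_ELBO
from numpyro.infer.autoguide import AutoNormal


def model(data):
    # Toy model: infer the location of a unit-variance normal.
    loc = numpyro.sample("loc", dist.Normal(0.0, 1.0))
    numpyro.sample("obs", dist.Normal(loc, 1.0), obs=data)


guide = AutoNormal(model)

# An optax chain whose schedule depends on the loss value; after this commit
# the wrapped optimizer receives that value on every update.
optimizer = optim.optax_to_numpyro(
    optax.chain(
        optax.adam(1e-2),
        optax.contrib.reduce_on_plateau(patience=5, accumulation_size=200),
    )
)

svi = SVI(model, guide, optimizer, loss=Trace_ELBO())
svi_result = svi.run(random.PRNGKey(0), 1000, jnp.array([0.2, -0.1, 0.4]))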

numpyro/contrib/einstein/steinvi.py

Lines changed: 1 addition & 1 deletion
@@ -467,7 +467,7 @@ def update(self, state: SteinVIState, *args, **kwargs) -> SteinVIState:
             **kwargs,
             **self.static_kwargs,
         )
-        optim_state = self.optim.update(grads, optim_state)
+        optim_state = self.optim.update(grads, optim_state, value=loss_val)
         return SteinVIState(
             optim_state, rng_key, state.loss_temperature, state.repulsion_temperature
         ), loss_val
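Since `SteinVI.update` now forwards `loss_val`, the same plateau-aware optimizer can be used with SteinVI as well. A minimal sketch; the toy model, `AutoNormal` guide, `RBFKernel`, and particle count follow the usual NumPyro Stein example and are illustrative assumptions, not part of this commit.

import jax.numpy as jnp
import optax
import optax.contrib
from jax import random

import numpyro
import numpyro.distributions as dist
from numpyro import optim
from numpyro.contrib.einstein import RBFKernel, SteinVI
from numpyro.infer.autoguide import AutoNormal


def model(data):
    loc = numpyro.sample("loc", dist.Normal(0.0, 1.0))
    numpyro.sample("obs", dist.Normal(loc, 1.0), obs=data)


stein = SteinVI(
    model,
    AutoNormal(model),
    # The loss value is now threaded through to the optax chain on each step.
    optim.optax_to_numpyro(
        optax.chain(
            optax.adam(1e-2),
            optax.contrib.reduce_on_plateau(patience=5, accumulation_size=200),
        )
    ),
    RBFKernel(),
    num_stein_particles=5,
)
result = stein.run(random.PRNGKey(0), 1000, jnp.array([0.2, -0.1, 0.4]))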

numpyro/optim.py

Lines changed: 46 additions & 10 deletions
@@ -9,7 +9,7 @@
 
 from collections import namedtuple
 from collections.abc import Callable
-from typing import Any
+from typing import Any, Optional, Protocol
 
 import jax
 from jax import jacfwd, lax, value_and_grad
@@ -50,11 +50,28 @@ def _wrapper(x):
         return value_and_grad(f, has_aux=True)(x)
 
 
+class UpdateExtraArgsFn(Protocol):
+    """An update function accepting additional keyword arguments."""
+
+    def __call__(
+        self,
+        arr: ArrayLike,
+        params: _Params,
+        state: _OptState,
+        **extra_args: Any,
+    ) -> _OptState:
+        """
+        Based on https://github.com/google-deepmind/optax/blob/2e66ce897e83b4901d37dcbb477a7432497848d6/optax/_src/base.py#L110-L147,
+        this protocol expresses an update function that *may* take extra arguments.
+        """
+
+
 class _NumPyroOptim(object):
     def __init__(self, optim_fn: Callable, *args, **kwargs) -> None:
         self.init_fn: Callable[[_Params], _IterOptState]
-        self.update_fn: Callable[[ArrayLike, _Params, _OptState], _OptState]
+        self.update_fn: UpdateExtraArgsFn
         self.get_params_fn: Callable[[_OptState], _Params]
+        self.update_with_value: bool = kwargs.pop("update_with_value", False)
         self.init_fn, self.update_fn, self.get_params_fn = optim_fn(*args, **kwargs)
 
     def init(self, params: _Params) -> _IterOptState:
@@ -67,7 +84,9 @@ def init(self, params: _Params) -> _IterOptState:
         opt_state = self.init_fn(params)
         return jnp.array(0), opt_state
 
-    def update(self, g: _Params, state: _IterOptState) -> _IterOptState:
+    def update(
+        self, g: _Params, state: _IterOptState, value: Optional[ArrayLike] = None
+    ) -> _IterOptState:
         """
         Gradient update for the optimizer.
 
@@ -76,7 +95,11 @@ def update(self, g: _Params, state: _IterOptState) -> _IterOptState:
         :return: new optimizer state after the update.
         """
         i, opt_state = state
-        opt_state = self.update_fn(i, g, opt_state)
+        if self.update_with_value:
+            assert value is not None
+            opt_state = self.update_fn(i, g, opt_state, value=value)
+        else:
+            opt_state = self.update_fn(i, g, opt_state)
         return i + 1, opt_state
 
     def eval_and_update(
@@ -104,7 +127,7 @@ def eval_and_update(
         (out, aux), grads = _value_and_grad(
             fn, x=params, forward_mode_differentiation=forward_mode_differentiation
         )
-        return (out, aux), self.update(grads, state)
+        return (out, aux), self.update(grads, state, value=out)
 
     def eval_and_stable_update(
         self,
@@ -128,7 +151,7 @@
         )
         out, state = lax.cond(
             jnp.isfinite(out) & jnp.isfinite(ravel_pytree(grads)[0]).all(),
-            lambda _: (out, self.update(grads, state)),
+            lambda _: (out, self.update(grads, state, value=out)),
             lambda _: (jnp.nan, state),
             None,
         )
@@ -178,7 +201,9 @@ def __init__(self, *args, clip_norm: float = 10.0, **kwargs) -> None:
         self.clip_norm = clip_norm
         super(ClippedAdam, self).__init__(optimizers.adam, *args, **kwargs)
 
-    def update(self, g: _Params, state: _IterOptState) -> _IterOptState:
+    def update(
+        self, g: _Params, state: _IterOptState, value: Optional[ArrayLike] = None
+    ) -> _IterOptState:
         i, opt_state = state
         # clip norm
         g = jax.tree.map(lambda g_: jnp.clip(g_, -self.clip_norm, self.clip_norm), g)
@@ -352,15 +377,26 @@ def init_fn(params: _Params) -> tuple[_Params, Any]:
         return params, opt_state
 
     def update_fn(
-        step: ArrayLike, grads: ArrayLike, state: tuple[_Params, Any]
+        step: ArrayLike,
+        grads: ArrayLike,
+        state: tuple[_Params, Any],
+        value: ArrayLike,
     ) -> tuple[_Params, Any]:
         params, opt_state = state
-        updates, opt_state = transformation.update(grads, opt_state, params)
+        updates, opt_state = optax.with_extra_args_support(transformation).update(
+            grads, opt_state, params, value=value
+        )
         updated_params = optax.apply_updates(params, updates)
         return updated_params, opt_state
 
     def get_params_fn(state: tuple[_Params, Any]) -> _Params:
         params, _ = state
         return params
 
-    return _NumPyroOptim(lambda x, y, z: (x, y, z), init_fn, update_fn, get_params_fn)
+    return _NumPyroOptim(
+        lambda x, y, z: (x, y, z),
+        init_fn,
+        update_fn,
+        get_params_fn,
+        update_with_value=True,
+    )
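To illustrate the mechanics above, the following sketch steps a wrapped optimizer by hand; the parameter, gradient, and loss values are stand-ins. With `update_with_value=True`, `_NumPyroOptim.update` requires `value=` and forwards it to the optax chain via `optax.with_extra_args_support`.

import jax.numpy as jnp
import optax
import optax.contrib

from numpyro import optim

# optax_to_numpyro now constructs the wrapper with update_with_value=True.
opt = optim.optax_to_numpyro(
    optax.chain(
        optax.adam(1e-2),
        optax.contrib.reduce_on_plateau(patience=5, accumulation_size=200),
    )
)
assert opt.update_with_value

params = {"w": jnp.zeros(3)}
state = opt.init(params)

grads = {"w": jnp.ones(3)}  # stand-in gradients
loss = jnp.asarray(0.5)     # stand-in loss value
# The value keyword is passed through to the optax transformation, which is
# what reduce_on_plateau uses to decide when to shrink the learning rate.
state = opt.update(grads, state, value=loss)
new_params = opt.get_params(state)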

test/test_optimizers.py

Lines changed: 49 additions & 28 deletions
@@ -12,21 +12,32 @@
 
 try:
     import optax
+    import optax.contrib
 
     # the optimizer test is parameterized by different optax optimizers, but we have
     # to define them here to ensure that `optax` is defined. pytest.mark.parameterize
     # decorators are run even if tests are skipped at the top of the file.
     optax_optimizers = [
-        (optax.adam, (1e-2,), {}),
+        (optax.adam, (1e-2,), {}, False),
         # clipped adam
-        (optax.chain, (optax.clip(10.0), optax.adam(1e-2)), {}),
-        (optax.adagrad, (1e-1,), {}),
+        (optax.chain, (optax.clip(10.0), optax.adam(1e-2)), {}, False),
+        (optax.adagrad, (1e-1,), {}, False),
         # SGD with momentum
-        (optax.sgd, (1e-2,), {"momentum": 0.9}),
-        (optax.rmsprop, (1e-2,), {"decay": 0.95}),
+        (optax.sgd, (1e-2,), {"momentum": 0.9}, False),
+        (optax.rmsprop, (1e-2,), {"decay": 0.95}, False),
         # RMSProp with momentum
-        (optax.rmsprop, (1e-4,), {"decay": 0.9, "momentum": 0.9}),
-        (optax.sgd, (1e-2,), {}),
+        (optax.rmsprop, (1e-4,), {"decay": 0.9, "momentum": 0.9}, False),
+        (optax.sgd, (1e-2,), {}, False),
+        # reduce learning rate on plateau
+        (
+            optax.chain,
+            (
+                optax.adam(1e-2),
+                optax.contrib.reduce_on_plateau(patience=5, accumulation_size=200),
+            ),
+            {},
+            True,
+        ),
     ]
 except ImportError:
     pytestmark = pytest.mark.skip(reason="optax is not installed")
@@ -41,24 +52,27 @@ def loss(params):
 def step(opt_state, optim):
     params = optim.get_params(opt_state)
     g = grad(loss)(params)
-    return optim.update(g, opt_state)
+    if optim.update_with_value:
+        return optim.update(g, opt_state, value=loss(params))
+    else:
+        return optim.update(g, opt_state)
 
 
 @pytest.mark.parametrize(
-    "optim_class, args, kwargs",
+    "optim_class, args, kwargs, uses_value_arg",
     [
-        (optim.Adam, (1e-2,), {}),
-        (optim.ClippedAdam, (1e-2,), {}),
-        (optim.Adagrad, (1e-1,), {}),
-        (optim.Momentum, (1e-2, 0.5), {}),
-        (optim.RMSProp, (1e-2, 0.95), {}),
-        (optim.RMSPropMomentum, (1e-4,), {}),
-        (optim.SGD, (1e-2,), {}),
+        (optim.Adam, (1e-2,), {}, False),
+        (optim.ClippedAdam, (1e-2,), {}, False),
+        (optim.Adagrad, (1e-1,), {}, False),
+        (optim.Momentum, (1e-2, 0.5), {}, False),
+        (optim.RMSProp, (1e-2, 0.95), {}, False),
+        (optim.RMSPropMomentum, (1e-4,), {}, False),
+        (optim.SGD, (1e-2,), {}, False),
     ]
     + optax_optimizers,
 )
 @pytest.mark.filterwarnings("ignore:.*tree_multimap:FutureWarning")
-def test_optim_multi_params(optim_class, args, kwargs):
+def test_optim_multi_params(optim_class, args, kwargs, uses_value_arg):
     params = {"x": jnp.array([1.0, 1.0, 1.0]), "y": jnp.array([-1, -1.0, -1.0])}
     opt = optim_class(*args, **kwargs)
     if not isinstance(opt, optim._NumPyroOptim):
@@ -73,20 +87,20 @@ def test_optim_multi_params(optim_class, args, kwargs):
 # note: this is somewhat of a bruteforce test. testing directly from
 # _NumpyroOptim would probably be better
 @pytest.mark.parametrize(
-    "optim_class, args, kwargs",
+    "optim_class, args, kwargs, uses_value_arg",
     [
-        (optim.Adam, (1e-2,), {}),
-        (optim.ClippedAdam, (1e-2,), {}),
-        (optim.Adagrad, (1e-1,), {}),
-        (optim.Momentum, (1e-2, 0.5), {}),
-        (optim.RMSProp, (1e-2, 0.95), {}),
-        (optim.RMSPropMomentum, (1e-4,), {}),
-        (optim.SGD, (1e-2,), {}),
+        (optim.Adam, (1e-2,), {}, False),
+        (optim.ClippedAdam, (1e-2,), {}, False),
+        (optim.Adagrad, (1e-1,), {}, False),
+        (optim.Momentum, (1e-2, 0.5), {}, False),
+        (optim.RMSProp, (1e-2, 0.95), {}, False),
+        (optim.RMSPropMomentum, (1e-4,), {}, False),
+        (optim.SGD, (1e-2,), {}, False),
    ]
     + optax_optimizers,
 )
 @pytest.mark.filterwarnings("ignore:.*tree_multimap:FutureWarning")
-def test_numpyrooptim_no_double_jit(optim_class, args, kwargs):
+def test_numpyrooptim_no_double_jit(optim_class, args, kwargs, uses_value_arg):
     opt = optim_class(*args, **kwargs)
     if not isinstance(opt, optim._NumPyroOptim):
         opt = optim.optax_to_numpyro(opt)
@@ -99,11 +113,18 @@ def my_fn(state, g):
         nonlocal my_fn_calls
         my_fn_calls += 1
 
-        state = opt.update(g, state)
+        if opt.update_with_value:
+            state = opt.update(g, state, value=0.01)
+        else:
+            state = opt.update(g, state)
         return state
 
     state = my_fn(state, jnp.ones(10) * 1.0)
     state = my_fn(state, jnp.ones(10) * 2.0)
     state = my_fn(state, jnp.ones(10) * 3.0)
 
-    assert my_fn_calls == 1
+    if uses_value_arg:
+        # Dtype is different on the first call vs the rest of the calls
+        assert my_fn_calls == 2
+    else:
+        assert my_fn_calls == 1
