From caf0f3450dba209c0f3bdb6da4dd72409780dc93 Mon Sep 17 00:00:00 2001 From: Alvaro Perez Date: Tue, 14 Nov 2023 15:30:36 +0100 Subject: [PATCH 01/16] Added ReduceLROnPlateau and test. --- pymc/variational/callbacks.py | 76 ++++++++++++++++++++++++++++- tests/variational/test_callbacks.py | 30 ++++++++++++ 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py index 72f0b4a2d..975b146d6 100644 --- a/pymc/variational/callbacks.py +++ b/pymc/variational/callbacks.py @@ -18,7 +18,9 @@ import numpy as np -__all__ = ["Callback", "CheckParametersConvergence", "Tracker"] +from pytensor import shared + +__all__ = ["Callback", "CheckParametersConvergence", "ReduceLROnPlateau", "Tracker"] class Callback: @@ -93,6 +95,78 @@ def flatten_shared(shared_list): return np.concatenate([sh.get_value().flatten() for sh in shared_list]) +class ReduceLROnPlateau(Callback): + """Reduce learning rate when the loss has stopped improving. + + This is inspired by Keras' homonymous callback: + https://github.com/keras-team/keras/blob/v2.14.0/keras/callbacks.py + + Parameters + ---------- + learning_rate: shared + shared variable containing the learning rate + factor: float + factor by which the learning rate will be reduced: `new_lr = lr * factor` + patience: int + number of epochs with no improvement after which learning rate will be reduced + min_lr: float + lower bound on the learning rate + cooldown: int + number of iterations to wait before resuming normal operation after lr has been reduced + verbose: bool + false: quiet, true: update messages + """ + + def __init__( + self, + initial_learning_rate: shared, + factor=0.1, + patience=10, + min_lr=1e-6, + cooldown=0, + ): + self.learning_rate = initial_learning_rate + self.factor = factor + self.patience = patience + self.min_lr = min_lr + self.cooldown = cooldown + + self.cooldown_counter = 0 + self.wait = 0 + self.best = float("inf") + self.old_lr = None + + def __call__(self, approx, loss_hist, i): + current = loss_hist[-1] + + if np.isinf(current): + return + + if self.in_cooldown(): + self.cooldown_counter -= 1 + self.wait = 0 + return + + if current < self.best: + self.best = current + self.wait = 0 + elif not np.isinf(self.best): + self.wait += 1 + if self.wait >= self.patience: + self.reduce_lr() + self.cooldown_counter = self.cooldown + self.wait = 0 + + def reduce_lr(self): + old_lr = float(self.learning_rate.get_value()) + if old_lr > self.min_lr: + new_lr = max(old_lr * self.factor, self.min_lr) + self.learning_rate.set_value(new_lr) + + def in_cooldown(self): + return self.cooldown_counter > 0 + + class Tracker(Callback): """ Helper class to record arbitrary stats during VI diff --git a/tests/variational/test_callbacks.py b/tests/variational/test_callbacks.py index a40acb1e9..84281767b 100644 --- a/tests/variational/test_callbacks.py +++ b/tests/variational/test_callbacks.py @@ -52,3 +52,33 @@ def test_tracker_callback(): tracker = pm.callbacks.Tracker(bad=lambda t: t) # bad signature with pytest.raises(TypeError): tracker(None, None, 1) + + +def test_reducelronplateau_callback(): + lr = pytensor.shared(0.1) + cb = pm.variational.callbacks.ReduceLROnPlateau( + initial_learning_rate=lr, + patience=1, + min_lr=0.001, + ) + cb(None, [float("inf")], 1) + np.testing.assert_almost_equal(lr.get_value(), 0.1) + assert cb.best == float("inf") + cb(None, [float("inf"), 2], 1) + np.testing.assert_almost_equal(lr.get_value(), 0.1) + assert cb.best == 2 + cb(None, [float("inf"), 2, 1], 1) + 
np.testing.assert_almost_equal(lr.get_value(), 0.1) + assert cb.best == 1 + cb(None, [float("inf"), 2, 1, 99], 1) + np.testing.assert_almost_equal(lr.get_value(), 0.01) + assert cb.best == 1 + cb(None, [float("inf"), 2, 1, 99, 0.9], 1) + np.testing.assert_almost_equal(lr.get_value(), 0.01) + assert cb.best == 0.9 + cb(None, [float("inf"), 2, 1, 99, 0.9, 99], 1) + np.testing.assert_almost_equal(lr.get_value(), 0.001) + assert cb.best == 0.9 + cb(None, [float("inf"), 2, 1, 99, 0.9, 99, 99], 1) + np.testing.assert_almost_equal(lr.get_value(), 0.001) + assert cb.best == 0.9 From 0afd4751d101fc23bfdb73b6dd62f08c665b7575 Mon Sep 17 00:00:00 2001 From: Alvaro Perez Date: Tue, 14 Nov 2023 16:13:57 +0100 Subject: [PATCH 02/16] Pass optimiser and not shared variable to callback --- pymc/variational/callbacks.py | 14 ++++++-------- tests/variational/test_callbacks.py | 18 +++++++++--------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py index 975b146d6..15d47dde2 100644 --- a/pymc/variational/callbacks.py +++ b/pymc/variational/callbacks.py @@ -18,8 +18,6 @@ import numpy as np -from pytensor import shared - __all__ = ["Callback", "CheckParametersConvergence", "ReduceLROnPlateau", "Tracker"] @@ -103,8 +101,8 @@ class ReduceLROnPlateau(Callback): Parameters ---------- - learning_rate: shared - shared variable containing the learning rate + optimiser: callable + PyMC optimiser factor: float factor by which the learning rate will be reduced: `new_lr = lr * factor` patience: int @@ -119,13 +117,13 @@ class ReduceLROnPlateau(Callback): def __init__( self, - initial_learning_rate: shared, + optimiser, factor=0.1, patience=10, min_lr=1e-6, cooldown=0, ): - self.learning_rate = initial_learning_rate + self.optimiser = optimiser self.factor = factor self.patience = patience self.min_lr = min_lr @@ -158,10 +156,10 @@ def __call__(self, approx, loss_hist, i): self.wait = 0 def reduce_lr(self): - old_lr = float(self.learning_rate.get_value()) + old_lr = float(self.optimiser.keywords["learning_rate"]) if old_lr > self.min_lr: new_lr = max(old_lr * self.factor, self.min_lr) - self.learning_rate.set_value(new_lr) + self.optimiser.keywords["learning_rate"] = new_lr def in_cooldown(self): return self.cooldown_counter > 0 diff --git a/tests/variational/test_callbacks.py b/tests/variational/test_callbacks.py index 84281767b..a442b006f 100644 --- a/tests/variational/test_callbacks.py +++ b/tests/variational/test_callbacks.py @@ -55,30 +55,30 @@ def test_tracker_callback(): def test_reducelronplateau_callback(): - lr = pytensor.shared(0.1) + optimiser = pm.adam(learning_rate=0.1) cb = pm.variational.callbacks.ReduceLROnPlateau( - initial_learning_rate=lr, + optimiser=optimiser, patience=1, min_lr=0.001, ) cb(None, [float("inf")], 1) - np.testing.assert_almost_equal(lr.get_value(), 0.1) + np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.1) assert cb.best == float("inf") cb(None, [float("inf"), 2], 1) - np.testing.assert_almost_equal(lr.get_value(), 0.1) + np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.1) assert cb.best == 2 cb(None, [float("inf"), 2, 1], 1) - np.testing.assert_almost_equal(lr.get_value(), 0.1) + np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.1) assert cb.best == 1 cb(None, [float("inf"), 2, 1, 99], 1) - np.testing.assert_almost_equal(lr.get_value(), 0.01) + np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.01) assert cb.best == 1 cb(None, 
[float("inf"), 2, 1, 99, 0.9], 1) - np.testing.assert_almost_equal(lr.get_value(), 0.01) + np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.01) assert cb.best == 0.9 cb(None, [float("inf"), 2, 1, 99, 0.9, 99], 1) - np.testing.assert_almost_equal(lr.get_value(), 0.001) + np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.001) assert cb.best == 0.9 cb(None, [float("inf"), 2, 1, 99, 0.9, 99, 99], 1) - np.testing.assert_almost_equal(lr.get_value(), 0.001) + np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.001) assert cb.best == 0.9 From 9e27de575ab2116ba2c7aa57d0b59159db5f8f20 Mon Sep 17 00:00:00 2001 From: Alvaro Perez Date: Wed, 15 Nov 2023 08:00:44 +0100 Subject: [PATCH 03/16] American spelling. --- pymc/variational/callbacks.py | 13 ++++++------- tests/variational/test_callbacks.py | 18 +++++++++--------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py index 15d47dde2..8173200cb 100644 --- a/pymc/variational/callbacks.py +++ b/pymc/variational/callbacks.py @@ -13,7 +13,6 @@ # limitations under the License. import collections - from typing import Callable, Dict import numpy as np @@ -101,8 +100,8 @@ class ReduceLROnPlateau(Callback): Parameters ---------- - optimiser: callable - PyMC optimiser + optimizer: callable + PyMC optimizer factor: float factor by which the learning rate will be reduced: `new_lr = lr * factor` patience: int @@ -117,13 +116,13 @@ class ReduceLROnPlateau(Callback): def __init__( self, - optimiser, + optimizer, factor=0.1, patience=10, min_lr=1e-6, cooldown=0, ): - self.optimiser = optimiser + self.optimizer = optimizer self.factor = factor self.patience = patience self.min_lr = min_lr @@ -156,10 +155,10 @@ def __call__(self, approx, loss_hist, i): self.wait = 0 def reduce_lr(self): - old_lr = float(self.optimiser.keywords["learning_rate"]) + old_lr = float(self.optimizer.keywords["learning_rate"]) if old_lr > self.min_lr: new_lr = max(old_lr * self.factor, self.min_lr) - self.optimiser.keywords["learning_rate"] = new_lr + self.optimizer.keywords["learning_rate"] = new_lr def in_cooldown(self): return self.cooldown_counter > 0 diff --git a/tests/variational/test_callbacks.py b/tests/variational/test_callbacks.py index a442b006f..214698d05 100644 --- a/tests/variational/test_callbacks.py +++ b/tests/variational/test_callbacks.py @@ -55,30 +55,30 @@ def test_tracker_callback(): def test_reducelronplateau_callback(): - optimiser = pm.adam(learning_rate=0.1) + optimizer = pm.adam(learning_rate=0.1) cb = pm.variational.callbacks.ReduceLROnPlateau( - optimiser=optimiser, + optimizer=optimizer, patience=1, min_lr=0.001, ) cb(None, [float("inf")], 1) - np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.1) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.1) assert cb.best == float("inf") cb(None, [float("inf"), 2], 1) - np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.1) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.1) assert cb.best == 2 cb(None, [float("inf"), 2, 1], 1) - np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.1) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.1) assert cb.best == 1 cb(None, [float("inf"), 2, 1, 99], 1) - np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.01) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.01) assert cb.best == 1 cb(None, 
[float("inf"), 2, 1, 99, 0.9], 1) - np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.01) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.01) assert cb.best == 0.9 cb(None, [float("inf"), 2, 1, 99, 0.9, 99], 1) - np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.001) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) assert cb.best == 0.9 cb(None, [float("inf"), 2, 1, 99, 0.9, 99, 99], 1) - np.testing.assert_almost_equal(optimiser.keywords["learning_rate"], 0.001) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) assert cb.best == 0.9 From d66d523b4c53f68a84ba41874b4f4cd754ac1d5f Mon Sep 17 00:00:00 2001 From: Alvaro Perez Date: Wed, 15 Nov 2023 08:01:18 +0100 Subject: [PATCH 04/16] Linting. --- pymc/variational/callbacks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py index 8173200cb..68be64a9f 100644 --- a/pymc/variational/callbacks.py +++ b/pymc/variational/callbacks.py @@ -13,6 +13,7 @@ # limitations under the License. import collections + from typing import Callable, Dict import numpy as np From a2c27bde894a27bb46b7cad520358c36d972015d Mon Sep 17 00:00:00 2001 From: Alvaro Perez Date: Wed, 15 Nov 2023 08:06:11 +0100 Subject: [PATCH 05/16] Removed unused variable. --- pymc/variational/callbacks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py index 68be64a9f..30f07aa9e 100644 --- a/pymc/variational/callbacks.py +++ b/pymc/variational/callbacks.py @@ -132,7 +132,6 @@ def __init__( self.cooldown_counter = 0 self.wait = 0 self.best = float("inf") - self.old_lr = None def __call__(self, approx, loss_hist, i): current = loss_hist[-1] From 74850bc0abc1a9aa36e41cdd8d516fb9864c5f80 Mon Sep 17 00:00:00 2001 From: Alvaro Perez Date: Thu, 16 Nov 2023 10:20:10 +0100 Subject: [PATCH 06/16] Updated test to use all optimizers. --- tests/variational/test_callbacks.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/variational/test_callbacks.py b/tests/variational/test_callbacks.py index 214698d05..4511f12ce 100644 --- a/tests/variational/test_callbacks.py +++ b/tests/variational/test_callbacks.py @@ -54,8 +54,20 @@ def test_tracker_callback(): tracker(None, None, 1) -def test_reducelronplateau_callback(): - optimizer = pm.adam(learning_rate=0.1) +@pytest.mark.parametrize( + "optimizer", + [ + pm.sgd(learning_rate=0.1), + pm.momentum(learning_rate=0.1), + pm.nesterov_momentum(learning_rate=0.1), + pm.adagrad(learning_rate=0.1), + pm.rmsprop(learning_rate=0.1), + pm.adadelta(learning_rate=0.1), + pm.adam(learning_rate=0.1), + pm.adamax(learning_rate=0.1), + ], +) +def test_reducelronplateau_callback(optimizer): cb = pm.variational.callbacks.ReduceLROnPlateau( optimizer=optimizer, patience=1, From d2abbcda047cc8e118da9abce8caf94a88282c32 Mon Sep 17 00:00:00 2001 From: Alvaro Perez Date: Thu, 16 Nov 2023 10:51:31 +0100 Subject: [PATCH 07/16] Added baseclass and ExponentialDecay. 
--- pymc/variational/callbacks.py | 79 +++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py index 30f07aa9e..0eda93482 100644 --- a/pymc/variational/callbacks.py +++ b/pymc/variational/callbacks.py @@ -93,8 +93,56 @@ def flatten_shared(shared_list): return np.concatenate([sh.get_value().flatten() for sh in shared_list]) -class ReduceLROnPlateau(Callback): - """Reduce learning rate when the loss has stopped improving. +class LearningRateScheduler(Callback): + """Baseclass for learning rate schedulers.""" + + def __init__(self, optimizer): + self.optimizer = optimizer + + def __call__(self, approx, loss_hist, i): + raise NotImplementedError("Must be implemented in subclass.") + + def _set_new_lr(self, new_lr): + self.optimizer.keywords["learning_rate"] = new_lr + + +class ExponentialScheduler(LearningRateScheduler): + """ + Exponentially decays the learning rate. + + Parameters + ---------- + decay_steps : int + Number of steps at which the learning rate decay happens. + decay_rate : float + Rate of decay. + min_lr: float + lower bound on the learning rate + staircase : bool + If True, decay the learning rate at discrete intervals. + """ + + def __init__(self, optimizer, decay_steps, decay_rate, min_lr=1e-6, staircase=False): + super().__init__(optimizer) + self.decay_steps = decay_steps + self.decay_rate = decay_rate + self.staircase = staircase + self.min_lr = min_lr + + self.initial_learning_rate = float(self.optimizer.keywords["learning_rate"]) + + def __call__(self, approx, loss_hist, i): + if self.staircase: + new_lr = self.initial_learning_rate * self.decay_rate ** (i // self.decay_steps) + else: + new_lr = self.initial_learning_rate * self.decay_rate ** (i / self.decay_steps) + if new_lr >= self.min_lr: + self._set_new_lr(new_lr) + + +class ReduceLROnPlateau(LearningRateScheduler): + """ + Reduce learning rate when the loss has stopped improving. 
This is inspired by Keras' homonymous callback: https://github.com/keras-team/keras/blob/v2.14.0/keras/callbacks.py @@ -111,8 +159,6 @@ class ReduceLROnPlateau(Callback): lower bound on the learning rate cooldown: int number of iterations to wait before resuming normal operation after lr has been reduced - verbose: bool - false: quiet, true: update messages """ def __init__( @@ -123,23 +169,31 @@ def __init__( min_lr=1e-6, cooldown=0, ): - self.optimizer = optimizer + super().__init__(optimizer) self.factor = factor self.patience = patience self.min_lr = min_lr self.cooldown = cooldown - self.cooldown_counter = 0 self.wait = 0 self.best = float("inf") + def _in_cooldown(self): + return self.cooldown_counter > 0 + + def _reduce_lr(self): + old_lr = float(self.optimizer.keywords["learning_rate"]) + new_lr = max(old_lr * self.factor, self.min_lr) + if new_lr >= self.min_lr: + self._set_new_lr(new_lr) + def __call__(self, approx, loss_hist, i): current = loss_hist[-1] if np.isinf(current): return - if self.in_cooldown(): + if self._in_cooldown(): self.cooldown_counter -= 1 self.wait = 0 return @@ -150,19 +204,10 @@ def __call__(self, approx, loss_hist, i): elif not np.isinf(self.best): self.wait += 1 if self.wait >= self.patience: - self.reduce_lr() + self._reduce_lr() self.cooldown_counter = self.cooldown self.wait = 0 - def reduce_lr(self): - old_lr = float(self.optimizer.keywords["learning_rate"]) - if old_lr > self.min_lr: - new_lr = max(old_lr * self.factor, self.min_lr) - self.optimizer.keywords["learning_rate"] = new_lr - - def in_cooldown(self): - return self.cooldown_counter > 0 - class Tracker(Callback): """ From 3068e0c3ffbcd4c6d110c16f49d3109da26d53a9 Mon Sep 17 00:00:00 2001 From: Alvaro Perez Date: Thu, 16 Nov 2023 10:57:09 +0100 Subject: [PATCH 08/16] Added tests. --- pymc/variational/callbacks.py | 2 +- tests/variational/test_callbacks.py | 45 ++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py index 0eda93482..9b0a97033 100644 --- a/pymc/variational/callbacks.py +++ b/pymc/variational/callbacks.py @@ -106,7 +106,7 @@ def _set_new_lr(self, new_lr): self.optimizer.keywords["learning_rate"] = new_lr -class ExponentialScheduler(LearningRateScheduler): +class ExponentialDecay(LearningRateScheduler): """ Exponentially decays the learning rate. 
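For reference, the decay rule the scheduler above applies at iteration i can be written as a small stand-alone sketch (illustrative only; as in the class above, the update is simply skipped once the computed rate would fall below min_lr):

    def decayed_lr(initial_lr, decay_rate, decay_steps, i, staircase=False):
        # staircase=True decays in discrete jumps every decay_steps iterations;
        # otherwise the rate decays continuously with the iteration count.
        exponent = i // decay_steps if staircase else i / decay_steps
        return initial_lr * decay_rate**exponent

With decay_rate=0.1 and decay_steps=1, the rate shrinks by a factor of ten per iteration until min_lr is reached, which is exactly the behavior exercised by the test below.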
diff --git a/tests/variational/test_callbacks.py b/tests/variational/test_callbacks.py index 4511f12ce..80d8e4b3c 100644 --- a/tests/variational/test_callbacks.py +++ b/tests/variational/test_callbacks.py @@ -67,7 +67,7 @@ def test_tracker_callback(): pm.adamax(learning_rate=0.1), ], ) -def test_reducelronplateau_callback(optimizer): +def test_reduce_lr_on_plateau(optimizer): cb = pm.variational.callbacks.ReduceLROnPlateau( optimizer=optimizer, patience=1, @@ -76,21 +76,52 @@ def test_reducelronplateau_callback(optimizer): cb(None, [float("inf")], 1) np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.1) assert cb.best == float("inf") - cb(None, [float("inf"), 2], 1) + cb(None, [float("inf"), 2], 2) np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.1) assert cb.best == 2 - cb(None, [float("inf"), 2, 1], 1) + cb(None, [float("inf"), 2, 1], 3) np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.1) assert cb.best == 1 - cb(None, [float("inf"), 2, 1, 99], 1) + cb(None, [float("inf"), 2, 1, 99], 4) np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.01) assert cb.best == 1 - cb(None, [float("inf"), 2, 1, 99, 0.9], 1) + cb(None, [float("inf"), 2, 1, 99, 0.9], 5) np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.01) assert cb.best == 0.9 - cb(None, [float("inf"), 2, 1, 99, 0.9, 99], 1) + cb(None, [float("inf"), 2, 1, 99, 0.9, 99], 6) np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) assert cb.best == 0.9 - cb(None, [float("inf"), 2, 1, 99, 0.9, 99, 99], 1) + cb(None, [float("inf"), 2, 1, 99, 0.9, 99, 99], 7) np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) assert cb.best == 0.9 + + +@pytest.mark.parametrize( + "optimizer", + [ + pm.sgd(learning_rate=0.1), + pm.momentum(learning_rate=0.1), + pm.nesterov_momentum(learning_rate=0.1), + pm.adagrad(learning_rate=0.1), + pm.rmsprop(learning_rate=0.1), + pm.adadelta(learning_rate=0.1), + pm.adam(learning_rate=0.1), + pm.adamax(learning_rate=0.1), + ], +) +def test_exponential_decay(optimizer): + cb = pm.variational.callbacks.ExponentialDecay( + optimizer=optimizer, + decay_steps=1, + decay_rate=0.1, + min_lr=0.001, + ) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.1) + cb(None, [float("inf")], 1) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.01) + cb(None, [float("inf"), 2, 2], 2) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) + cb(None, [float("inf"), 2, 2, 2], 3) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) + cb(None, [float("inf"), 2, 2, 2, 2], 4) + np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) From 86f40972cf22e5b448d8de2ef35f72232f00389c Mon Sep 17 00:00:00 2001 From: Alvaro Perez Date: Thu, 16 Nov 2023 11:06:22 +0100 Subject: [PATCH 09/16] Docstrings. --- pymc/variational/callbacks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py index 9b0a97033..ac013ebfc 100644 --- a/pymc/variational/callbacks.py +++ b/pymc/variational/callbacks.py @@ -13,7 +13,6 @@ # limitations under the License. import collections - from typing import Callable, Dict import numpy as np @@ -110,6 +109,9 @@ class ExponentialDecay(LearningRateScheduler): """ Exponentially decays the learning rate. 
+    This is inspired by Keras' homonymous callback:
+    https://github.com/keras-team/keras/blob/v2.14.0/keras/optimizers/schedules/learning_rate_schedule.py
+
     Parameters
     ----------
     decay_steps : int

From ba3e3f21779390afc2910a572a95b74953088f5f Mon Sep 17 00:00:00 2001
From: Alvaro Perez
Date: Thu, 16 Nov 2023 11:06:55 +0100
Subject: [PATCH 10/16] Linting.

---
 pymc/variational/callbacks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py
index ac013ebfc..aabca2641 100644
--- a/pymc/variational/callbacks.py
+++ b/pymc/variational/callbacks.py
@@ -13,6 +13,7 @@
 #   limitations under the License.
 
 import collections
+
 from typing import Callable, Dict
 
 import numpy as np

From 8bf6241833b9074a67e386bb380a9a0e9614c3d4 Mon Sep 17 00:00:00 2001
From: Alvaro Perez
Date: Thu, 16 Nov 2023 11:11:42 +0100
Subject: [PATCH 11/16] Added new callbacks to __all__.

---
 pymc/variational/callbacks.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py
index aabca2641..4f609efba 100644
--- a/pymc/variational/callbacks.py
+++ b/pymc/variational/callbacks.py
@@ -18,7 +18,13 @@
 
 import numpy as np
 
-__all__ = ["Callback", "CheckParametersConvergence", "ReduceLROnPlateau", "Tracker"]
+__all__ = [
+    "Callback",
+    "CheckParametersConvergence",
+    "ExponentialDecay",
+    "ReduceLROnPlateau",
+    "Tracker",
+]
 
 
 class Callback:

From 2700c2eb13f7d2a5a9e1b57c69630ef97b64abde Mon Sep 17 00:00:00 2001
From: jessegrabowski
Date: Sun, 3 Dec 2023 00:04:07 +0100
Subject: [PATCH 12/16] Implement schedulers as function wrappers

---
 pymc/variational/opvi.py            |   7 +-
 pymc/variational/updates.py         | 485 +++++++++++++++++++++++-----
 tests/variational/test_callbacks.py |  66 ++--
 tests/variational/test_updates.py   | 145 ++++++++-
 4 files changed, 571 insertions(+), 132 deletions(-)

diff --git a/pymc/variational/opvi.py b/pymc/variational/opvi.py
index 99261f026..7a2e1625c 100644
--- a/pymc/variational/opvi.py
+++ b/pymc/variational/opvi.py
@@ -251,6 +251,7 @@ def updates(
         if more_updates is None:
             more_updates = dict()
         resulting_updates = ObjectiveUpdates()
+
         if self.test_params:
             self.add_test_updates(
                 resulting_updates,
@@ -313,10 +314,14 @@ def add_obj_updates(
         obj_target = self(
             obj_n_mc, more_obj_params=more_obj_params, more_replacements=more_replacements
         )
+
         grads = pm.updates.get_or_compute_grads(obj_target, self.obj_params + more_obj_params)
         if total_grad_norm_constraint is not None:
             grads = pm.total_norm_constraint(grads, total_grad_norm_constraint)
-        updates.update(obj_optimizer(grads, self.obj_params + more_obj_params))
+
+        # Pass the loss plus the gradients to the optimizer, so that schedulers can use the loss if needed.
+ updates.update(obj_optimizer((obj_target, grads), self.obj_params + more_obj_params)) + if self.op.returns_loss: updates.loss = obj_target diff --git a/pymc/variational/updates.py b/pymc/variational/updates.py index a6d049608..bd47046b2 100644 --- a/pymc/variational/updates.py +++ b/pymc/variational/updates.py @@ -109,7 +109,8 @@ """ from collections import OrderedDict -from functools import partial +from functools import partial, wraps +from typing import Callable import numpy as np import pytensor @@ -173,17 +174,101 @@ def get_or_compute_grads(loss_or_grads, params): ) return loss_or_grads else: - return pytensor.grad(loss_or_grads, params) + grads = pytensor.grad(loss_or_grads, params) + for grad, param in zip(grads, params): + grad.name = f'd_loss/d_{param.name}' + return grads -def _get_call_kwargs(_locals_): - _locals_ = _locals_.copy() - _locals_.pop("loss_or_grads") - _locals_.pop("params") - return _locals_ +def _input_to_shared_variable(x, name): + if isinstance(x, pt.sharedvar.SharedVariable): + return x + return pytensor.shared(x, name=name) -def sgd(loss_or_grads=None, params=None, learning_rate=1e-3): +def _find_variable_among_args_kwargs(args, kwargs, name): + """ + Helper function to find a variable among args and kwargs. + + Notes + ----- + Assumes that the variable being searched for is either a kwarg or the first arg. + """ + + variable = kwargs.pop(name, None) + if not variable: + variable = args.pop(0) if len(args) > 0 else None + return variable + + +def _partial_initialization_wrapper(optimizer): + """ + Functional wrapper to allow optimizer to be called without both loss_or_grads and params + + Parameters + ---------- + optimizer: callable + Optimizer function to wrap + + Returns + ------- + optimizer: callable + Optimizer function that returns itself partially initialized if called without both loss_or_grads and params + """ + + @wraps(optimizer) + def make_partial_if_necessary(*args, **kwargs): + args = list(args) + loss_or_grads = _find_variable_among_args_kwargs(args, kwargs, 'loss_or_grads') + params = _find_variable_among_args_kwargs(args, kwargs, 'params') + + if loss_or_grads is None and params is None: + return partial(optimizer, **kwargs) + elif loss_or_grads is None or params is None: + raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") + + return optimizer(loss_or_grads=loss_or_grads, params=params, **kwargs) + + return make_partial_if_necessary + + +def _handle_loss_and_grad_input_wrapper(optimizer): + """ + Functional wrapper to allow optimizer to take a tuple of (loss, grads) as input, and either discard the loss or + pass it through. + + Adds a keyword argument to the wrapped optimizer, `discard_loss`, which if True, will discard the loss and only + return the updates. If False, the optimizer will return both the updates and the loss. 
+ + Parameters + ---------- + optimizer: callable + Optimizer function to wrap + + Returns + ------- + optimizer: callable + Wrapped optimizer function + """ + + @wraps(optimizer) + def discard_or_pass_through_loss_optimizer(loss_or_grads, params, discard_loss=True, *args, **kwargs): + if isinstance(loss_or_grads, tuple): + loss, grads = loss_or_grads + updates = optimizer(loss_or_grads=grads, params=params, *args, **kwargs) + else: + discard_loss, loss = True, None + updates = optimizer(loss_or_grads=loss_or_grads, params=params, *args, **kwargs) + + if discard_loss: + return updates + else: + return updates, loss + + return discard_or_pass_through_loss_optimizer + + +def _sgd(loss_or_grads=None, params=None, *, learning_rate=1e-3): """Stochastic Gradient Descent (SGD) updates Generates update expressions of the form: @@ -223,19 +308,20 @@ def sgd(loss_or_grads=None, params=None, learning_rate=1e-3): >>> isinstance(updates, dict) True """ - if loss_or_grads is None and params is None: - return partial(sgd, **_get_call_kwargs(locals())) - elif loss_or_grads is None or params is None: - raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() for param, grad in zip(params, grads): - updates[param] = param - learning_rate * grad + updated_param = param - learning_rate * grad + updated_param.name = f'{param.name}__updated' + updates[param] = updated_param return updates +sgd = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_sgd)) + + def apply_momentum(updates, params=None, momentum=0.9): """Returns a modified update dictionary including momentum @@ -275,15 +361,21 @@ def apply_momentum(updates, params=None, momentum=0.9): for param in params: value = param.get_value(borrow=True) - velocity = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape) - x = momentum * velocity + updates[param] - updates[velocity] = x - param - updates[param] = x + velocity = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name='velocity') + + updated_param = momentum * velocity + updates[param] + updated_param.name = f'{param}__updated' + + updated_velocity = updated_param - param + updated_velocity.name = 'velocity__updated' + + updates[velocity] = updated_velocity + updates[param] = updated_param return updates -def momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): +def _momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): """Stochastic Gradient Descent (SGD) updates with momentum Generates update expressions of the form: @@ -335,14 +427,14 @@ def momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): >>> isinstance(updates, dict) True """ - if loss_or_grads is None and params is None: - return partial(pm.updates.momentum, **_get_call_kwargs(locals())) - elif loss_or_grads is None or params is None: - raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") updates = sgd(loss_or_grads, params, learning_rate) + return apply_momentum(updates, momentum=momentum) +momentum = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_momentum)) + + def apply_nesterov_momentum(updates, params=None, momentum=0.9): """Returns a modified update dictionary including Nesterov momentum @@ -389,14 +481,20 @@ def apply_nesterov_momentum(updates, params=None, momentum=0.9): for param in params: value = param.get_value(borrow=True) 
velocity = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape) - x = momentum * velocity + updates[param] - param - updates[velocity] = x - updates[param] = momentum * x + updates[param] + + updated_velocity = momentum * velocity + updates[param] - param + updated_velocity.name = 'velocity__updated' + + updated_param = momentum * updated_velocity + updates[param] + updated_param.name = f'{param.name}__updated' + + updates[velocity] = updated_velocity + updates[param] = updated_param return updates -def nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): +def _nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): """Stochastic Gradient Descent (SGD) updates with Nesterov momentum Generates update expressions of the form: @@ -453,15 +551,15 @@ def nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momen >>> isinstance(updates, dict) True """ - if loss_or_grads is None and params is None: - return partial(nesterov_momentum, **_get_call_kwargs(locals())) - elif loss_or_grads is None or params is None: - raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") + updates = sgd(loss_or_grads, params, learning_rate) return apply_nesterov_momentum(updates, momentum=momentum) -def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): +nesterov_momentum = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_nesterov_momentum)) + + +def _adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): """Adagrad updates Scale learning rates by dividing with the square root of accumulated @@ -521,24 +619,30 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): >>> isinstance(updates, dict) True """ - if loss_or_grads is None and params is None: - return partial(adagrad, **_get_call_kwargs(locals())) - elif loss_or_grads is None or params is None: - raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() for param, grad in zip(params, grads): value = param.get_value(borrow=True) - accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape) - accu_new = accu + grad**2 + accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, + name='gradient_squares') + accu_new = accu + grad ** 2 + accu_new.name = 'gradient_squares__updated' + updates[accu] = accu_new - updates[param] = param - (learning_rate * grad / pt.sqrt(accu_new + epsilon)) + + updated_param = param - (learning_rate * grad / pt.sqrt(accu_new + epsilon)) + updated_param.name = f'{param.name}__updated' + + updates[param] = updated_param return updates -def adagrad_window(loss_or_grads=None, params=None, learning_rate=0.001, epsilon=0.1, n_win=10): +adagrad = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_adagrad)) + + +def _adagrad_window(loss_or_grads=None, params=None, learning_rate=0.001, epsilon=0.1, n_win=10): """Returns a function that returns parameter updates. 
Instead of accumulated estimate, uses running window @@ -560,30 +664,39 @@ def adagrad_window(loss_or_grads=None, params=None, learning_rate=0.001, epsilon OrderedDict A dictionary mapping each parameter to its update expression """ - if loss_or_grads is None and params is None: - return partial(adagrad_window, **_get_call_kwargs(locals())) - elif loss_or_grads is None or params is None: - raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() for param, grad in zip(params, grads): - i = pytensor.shared(pm.floatX(0)) + i = pytensor.shared(pm.floatX(0), name='window_idx') i_int = i.astype("int32") value = param.get_value(borrow=True) - accu = pytensor.shared(np.zeros(value.shape + (n_win,), dtype=value.dtype)) + + accu = pytensor.shared(np.zeros(value.shape + (n_win,), dtype=value.dtype), + name='gradient_squares') # Append squared gradient vector to accu_new - accu_new = pt.set_subtensor(accu[..., i_int], grad**2) + accu_new = pt.set_subtensor(accu[..., i_int], grad ** 2) + accu_new.name = 'gradient_squares__updated' + i_new = pt.switch((i + 1) < n_win, i + 1, 0) + i_new.name = 'window_idx__updated' + updates[accu] = accu_new updates[i] = i_new accu_sum = accu_new.sum(axis=-1) - updates[param] = param - (learning_rate * grad / pt.sqrt(accu_sum + epsilon)) + + param_updated = param - (learning_rate * grad / pt.sqrt(accu_sum + epsilon)) + param_updated.name = f'{param.name}__updated' + updates[param] = param_updated + return updates -def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon=1e-6): +adagrad_window = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_adagrad_window)) + + +def _rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon=1e-6): """RMSProp updates Scale learning rates by dividing with the moving average of the root mean @@ -644,10 +757,6 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon >>> isinstance(updates, dict) True """ - if loss_or_grads is None and params is None: - return partial(rmsprop, **_get_call_kwargs(locals())) - elif loss_or_grads is None or params is None: - raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() @@ -656,15 +765,25 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon for param, grad in zip(params, grads): value = param.get_value(borrow=True) - accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape) - accu_new = rho * accu + (one - rho) * grad**2 + accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, + name='gradient_squares') + + accu_new = rho * accu + (one - rho) * grad ** 2 + accu_new.name = 'gradient_squares__updated' + updates[accu] = accu_new - updates[param] = param - (learning_rate * grad / pt.sqrt(accu_new + epsilon)) + + param_updated = param - (learning_rate * grad / pt.sqrt(accu_new + epsilon)) + param_updated.name = f'{param.name}__updated' + updates[param] = param_updated return updates -def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsilon=1e-6): +rmsprop = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_rmsprop)) + + +def _adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsilon=1e-6): r"""Adadelta updates Scale learning rates by the ratio of 
accumulated gradients to accumulated @@ -734,10 +853,6 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil >>> isinstance(updates, dict) True """ - if loss_or_grads is None and params is None: - return partial(adadelta, **_get_call_kwargs(locals())) - elif loss_or_grads is None or params is None: - raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() @@ -747,14 +862,16 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil for param, grad in zip(params, grads): value = param.get_value(borrow=True) # accu: accumulate gradient magnitudes - accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape) + + accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, + name='gradient_squares') # delta_accu: accumulate update magnitudes (recursively!) delta_accu = pytensor.shared( - np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape + np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name='' ) # update accu (as in rmsprop) - accu_new = rho * accu + (one - rho) * grad**2 + accu_new = rho * accu + (one - rho) * grad ** 2 updates[accu] = accu_new # compute parameter update, using the 'old' delta_accu @@ -762,14 +879,17 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil updates[param] = param - learning_rate * update # update delta_accu (as accu, but accumulating updates) - delta_accu_new = rho * delta_accu + (one - rho) * update**2 + delta_accu_new = rho * delta_accu + (one - rho) * update ** 2 updates[delta_accu] = delta_accu_new return updates -def adam( - loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8 +adadelta = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_adadelta)) + + +def _adam( + loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8 ): """Adam updates @@ -824,39 +944,48 @@ def adam( >>> isinstance(updates, dict) True """ - if loss_or_grads is None and params is None: - return partial(adam, **_get_call_kwargs(locals())) - elif loss_or_grads is None or params is None: - raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") all_grads = get_or_compute_grads(loss_or_grads, params) - t_prev = pytensor.shared(pm.pytensorf.floatX(0.0)) + t_prev = pytensor.shared(pm.pytensorf.floatX(0.0), name='t') updates = OrderedDict() # Using pytensor constant to prevent upcasting of float32 one = pt.constant(1) t = t_prev + 1 - a_t = learning_rate * pt.sqrt(one - beta2**t) / (one - beta1**t) + t.name = 't__updated' + a_t = learning_rate * pt.sqrt(one - beta2 ** t) / (one - beta1 ** t) + a_t.name = 'a' for param, g_t in zip(params, all_grads): + name = param.name value = param.get_value(borrow=True) - m_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape) - v_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape) + m_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f'{name}_m') + v_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f'{name}_v') m_t = beta1 * m_prev + (one - beta1) * g_t - v_t = beta2 * v_prev + (one - beta2) * g_t**2 + m_t.name = f'{name}_m__updated' + v_t = beta2 * v_prev + (one - beta2) * g_t ** 2 + v_t.name = 
f'{name}_v__updated' + step = a_t * m_t / (pt.sqrt(v_t) + epsilon) + step.name = f'{name}_step_size' updates[m_prev] = m_t updates[v_prev] = v_t - updates[param] = param - step + + param_updated = param - step + param_updated.name = f'{name}__updated' + updates[param] = param_updated updates[t_prev] = t return updates -def adamax( - loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8 +adam = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_adam)) + + +def _adamax( + loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8 ): """Adamax updates @@ -908,37 +1037,48 @@ def adamax( >>> isinstance(updates, dict) True """ - if loss_or_grads is None and params is None: - return partial(adamax, **_get_call_kwargs(locals())) - elif loss_or_grads is None or params is None: - raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") all_grads = get_or_compute_grads(loss_or_grads, params) - t_prev = pytensor.shared(pm.pytensorf.floatX(0.0)) + t_prev = pytensor.shared(pm.pytensorf.floatX(0.0), name='t') updates = OrderedDict() # Using pytensor constant to prevent upcasting of float32 one = pt.constant(1) t = t_prev + 1 - a_t = learning_rate / (one - beta1**t) + t.name = 't__updated' + + a_t = learning_rate / (one - beta1 ** t) + a_t.name = 'a' for param, g_t in zip(params, all_grads): + name = param.name value = param.get_value(borrow=True) - m_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape) - u_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape) + m_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f'{name}_m') + u_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f'{name}_u') m_t = beta1 * m_prev + (one - beta1) * g_t + m_t.name = f'{name}_m__updated' + u_t = pt.maximum(beta2 * u_prev, abs(g_t)) + u_t.name = f'{name}_u__updated' + step = a_t * m_t / (u_t + epsilon) + step.name = f'{name}_step_size' updates[m_prev] = m_t updates[u_prev] = u_t - updates[param] = param - step + + param_updated = param - step + param_updated.name = f'{name}__updated' + updates[param] = param_updated updates[t_prev] = t return updates +adamax = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_adamax)) + + def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7): """Max weight norm constraints and gradient clipping @@ -1072,7 +1212,7 @@ def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, return_norm=False learning with neural networks. In Advances in Neural Information Processing Systems (pp. 3104-3112). """ - norm = pt.sqrt(sum(pt.sum(tensor**2) for tensor in tensor_vars)) + norm = pt.sqrt(sum(pt.sum(tensor ** 2) for tensor in tensor_vars)) dtype = np.dtype(pytensor.config.floatX).type target_norm = pt.clip(norm, 0, dtype(max_norm)) multiplier = target_norm / (dtype(epsilon) + norm) @@ -1082,3 +1222,172 @@ def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, return_norm=False return tensor_vars_scaled, norm else: return tensor_vars_scaled + + +def _handle_time_updates(updates): + """ + Create a shared time variable and its update if one does not already exist in the updates dictionary, otherwise + extract it and delete the entry from the updates dictionary. 
+ + Parameters + ---------- + updates: dict + update dictionary created by an optimizer function + + Returns + ------- + t: pt.shared.SharedVariable + shared variable representing the current time step + new_t: pt.shared.SharedVariable + shared variable representing the next time step + + Notes + ----- + This function potentially modifies the update dictionary in-place by deleting the entry for the time variable, if + it exists. This is done to ensure that the time variable is always the last update applied. All schedulers need + to add this update back in to the update dictionary before returning it. + """ + old_values = list(updates.keys()) + old_names = [shared_var.name for shared_var in old_values] + + t_idx = old_names.index('t') if 't' in old_names else None + if t_idx is None: + t = pytensor.shared(pm.pytensorf.floatX(0.0), name='t') + new_t = t + 1 + new_t.name = 't__updated' + else: + # If t is already present, we will reuse it, but we also need to delete it from the update dict temporarily. + # We always want it to be the last update applied. + t = old_values[t_idx] + new_t = updates[t] + del updates[t] + + return t, new_t + + +def exponential_decay_scheduler(optimizer: Callable, decay_steps: int, decay_rate: float, min_lr: float = 1e-6, + staircase: bool = False): + """ + Returns a new optimizer that applies exponential decay to the learning rate. + + Parameters + ---------- + optimizer: Callable + Optimizer to apply exponential decay to + decay_steps: int + Number of steps between application of a decay. + decay_rate: float + Decay factor used to compute new learning rate, with new_lr = max(lr * decay_rate, min_lr). Must be between 0 + and 1. + min_lr: float + Minimum learning rate, after which no additional decay is applied. Defaults to 1e-6. + staircase: bool + If True, learning rate is decayed in discrete intervals, otherwise decay is applied continuously. + Defaults to False. + + Returns + ------- + scheduled_optimizer: Callable + Optimizer with exponential decay applied to learning rate. 
+ """ + if not 0 < decay_rate <= 1: + raise ValueError('decay_rate must be between 0 and 1') + + kwargs = optimizer.keywords + _initial_lr = pm.floatX(optimizer.keywords['learning_rate']) + + initial_lr = pt.constant(_initial_lr, name='initial_learning_rate') + shared_lr = _input_to_shared_variable(_initial_lr, 'learning_rate') + kwargs['learning_rate'] = shared_lr + + @wraps(optimizer) + def optimizer_with_exponential_decay(loss_or_grads, params, *args, **kwargs): + updates = optimizer(loss_or_grads, params, *args, **kwargs) + t, new_t = _handle_time_updates(updates) + + if staircase: + new_lr = initial_lr * decay_rate ** (t // decay_steps) + else: + new_lr = initial_lr * decay_rate ** (t / decay_steps) + + new_lr = pt.maximum(new_lr, min_lr) + + new_lr.name = 'learning_rate__updated' + updates[shared_lr] = new_lr + updates[t] = new_t + + return updates + + return optimizer_with_exponential_decay + + +def reduce_lr_on_plateau_scheduler(optimizer, factor=0.1, patience=10, min_lr=1e-6, cooldown=0): + kwargs = optimizer.keywords + _initial_lr = pm.floatX(optimizer.keywords['learning_rate']) + shared_lr = _input_to_shared_variable(_initial_lr, 'learning_rate') + kwargs['learning_rate'] = shared_lr + + @wraps(optimizer) + def optimizer_with_reduce_lr_on_plateau(loss_or_grads, params, *args, **kwargs): + updates, loss = optimizer(loss_or_grads, params, *args, discard_loss=False, **kwargs) + + cooldown_counter = pytensor.shared(np.zeros((), dtype='int32'), name='cooldown_counter') + wait = pytensor.shared(np.zeros((), dtype='int32'), name='wait') + best_loss = pytensor.shared(np.inf, name='best_loss') + + loss_is_inf = pt.isinf(loss) + + in_cooldown = pt.gt(cooldown_counter, 0) + improving_loss = pt.lt(loss, best_loss) + patience_exceeded = pt.ge(wait, patience) + + updated_best_loss = pt.switch(loss_is_inf, + best_loss, + pt.switch(improving_loss, + loss, + best_loss)) + + updated_best_loss.name = 'best_loss__updated' + + updated_cooldown_counter = pt.switch(loss_is_inf, + cooldown_counter, + pt.switch(in_cooldown, + cooldown_counter - 1, + pt.switch(improving_loss, + cooldown_counter, + pt.switch(patience_exceeded, + cooldown, + cooldown_counter)))) + updated_cooldown_counter.name = 'cooldown_counter__updated' + + updated_lr = pt.switch(loss_is_inf, + shared_lr, + pt.switch(in_cooldown, + shared_lr, + pt.switch(improving_loss, + shared_lr, + pt.switch(patience_exceeded, + pt.maximum(min_lr, shared_lr * factor), + shared_lr)))) + + updated_lr.name = 'learning_rate__updated' + + updated_wait = pt.switch(loss_is_inf, + wait, + pt.switch(in_cooldown, + 0, + pt.switch(improving_loss, + 0, + pt.switch(patience_exceeded, + 0, + wait + 1)))) + updated_wait.name = 'wait__updated' + + updates[best_loss] = updated_best_loss + updates[cooldown_counter] = updated_cooldown_counter + updates[wait] = updated_wait + updates[shared_lr] = updated_lr + + return updates + + return optimizer_with_reduce_lr_on_plateau diff --git a/tests/variational/test_callbacks.py b/tests/variational/test_callbacks.py index 80d8e4b3c..799102ef5 100644 --- a/tests/variational/test_callbacks.py +++ b/tests/variational/test_callbacks.py @@ -54,22 +54,19 @@ def test_tracker_callback(): tracker(None, None, 1) -@pytest.mark.parametrize( - "optimizer", - [ - pm.sgd(learning_rate=0.1), - pm.momentum(learning_rate=0.1), - pm.nesterov_momentum(learning_rate=0.1), - pm.adagrad(learning_rate=0.1), - pm.rmsprop(learning_rate=0.1), - pm.adadelta(learning_rate=0.1), - pm.adam(learning_rate=0.1), - pm.adamax(learning_rate=0.1), - ], -) 
+OPTIMIZERS = [pm.sgd, + pm.momentum, + pm.nesterov_momentum, + pm.adagrad, + pm.rmsprop, + pm.adadelta, + pm.adam, + pm.adamax] + +@pytest.mark.parametrize("optimizer", OPTIMIZERS) def test_reduce_lr_on_plateau(optimizer): cb = pm.variational.callbacks.ReduceLROnPlateau( - optimizer=optimizer, + optimizer=optimizer(learning_rate=0.1), patience=1, min_lr=0.001, ) @@ -96,19 +93,7 @@ def test_reduce_lr_on_plateau(optimizer): assert cb.best == 0.9 -@pytest.mark.parametrize( - "optimizer", - [ - pm.sgd(learning_rate=0.1), - pm.momentum(learning_rate=0.1), - pm.nesterov_momentum(learning_rate=0.1), - pm.adagrad(learning_rate=0.1), - pm.rmsprop(learning_rate=0.1), - pm.adadelta(learning_rate=0.1), - pm.adam(learning_rate=0.1), - pm.adamax(learning_rate=0.1), - ], -) +@pytest.mark.parametrize("optimizer", OPTIMIZERS) def test_exponential_decay(optimizer): cb = pm.variational.callbacks.ExponentialDecay( optimizer=optimizer, @@ -125,3 +110,30 @@ def test_exponential_decay(optimizer): np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) cb(None, [float("inf"), 2, 2, 2, 2], 4) np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) + + +def test_learning_rate_scheduler(): + X = np.random.normal(size=(100, 5)) + beta = np.random.normal(size=(5,)) + alpha = np.random.normal() + noise = np.random.normal(size=(100,)) + y = alpha + X @ beta + noise + + optimizer = pm.sgd() + scheduler = pm.callbacks.ExponentialDecay(optimizer=optimizer, decay_steps=10, decay_rate=0.5) + + with pm.Model() as mod: + b = pm.Normal('b', shape=(5,)) + a = pm.Normal('a') + sigma = pm.HalfNormal('sigma') + + mu = a + X @ b + y_hat = pm.Normal('y_hat', mu, sigma, observed=y) + advi = pm.ADVI(optimizer=optimizer) + + tracker = pm.callbacks.Tracker(mean=advi.approx.mean.eval) + approx = advi.fit(100, callbacks=[scheduler, tracker]) + + mean_history = tracker['mean'] + + diff --git a/tests/variational/test_updates.py b/tests/variational/test_updates.py index 0ea4183a1..9b91dc12f 100644 --- a/tests/variational/test_updates.py +++ b/tests/variational/test_updates.py @@ -16,6 +16,7 @@ import pytensor import pytest +import pymc as pm from pymc.variational.updates import ( adadelta, adagrad, @@ -25,9 +26,22 @@ momentum, nesterov_momentum, rmsprop, - sgd, + sgd, exponential_decay_scheduler, reduce_lr_on_plateau_scheduler, ) +OPTIMIZERS = [sgd, + momentum, + nesterov_momentum, + adagrad, + rmsprop, + adadelta, + adam, + adamax, + adagrad_window] + +OPTIMIZER_NAMES = ["sgd", "momentum", "nesterov_momentum", "adagrad", "rmsprop", + "adadelta", "adam", "adamax", "adagrad_window"] + _a = pytensor.shared(1.0) _b = _a * 2 @@ -37,21 +51,7 @@ _n2 = _b + _n + _m2.sum() -@pytest.mark.parametrize( - "opt", - [sgd, momentum, nesterov_momentum, adagrad, rmsprop, adadelta, adam, adamax, adagrad_window], - ids=[ - "sgd", - "momentum", - "nesterov_momentum", - "adagrad", - "rmsprop", - "adadelta", - "adam", - "adamax", - "adagrad_window", - ], -) +@pytest.mark.parametrize("opt", OPTIMIZERS, ids=OPTIMIZER_NAMES) @pytest.mark.parametrize( "getter", [ @@ -93,3 +93,116 @@ def test_updates_fast(opt, loss_and_params, kwargs, getter): # Usual call to optimizer, old behaviour updates = opt(**args) assert isinstance(updates, dict) + + +@pytest.fixture() +def regression_model(): + rng = np.random.default_rng(1) + + X = rng.normal(size=(100,)) + intercept, coef = rng.normal(100, size=(2,)) + noise = rng.normal(size=(100,)) + y = intercept + coef * X + noise + + with pm.Model() as model: + a = pm.Normal("a", ) + b = 
pm.Normal("b") + + mu = a + b * X + pm.Normal("y", mu=mu, sigma=1, observed=y) + + return model + + +SCHEDULER_PARAMS = [(1, 0.5, 1, 1e-8, False), + (1, 0.5, 1, 1e-8, True), + (1, 0.5, 2, 1e-8, False), + (1, 0.5, 2, 1e-8, True)] +SCHEDULER_IDS = [f"initial_lr={x[0]}, decay_rate={x[1]}, decay_steps={x[2]}, min_lr={x[3]}, staircase={x[4]}" + for x in SCHEDULER_PARAMS] + + +@pytest.mark.parametrize("optimizer", OPTIMIZERS, ids=OPTIMIZER_NAMES) +@pytest.mark.parametrize('scheduler_args', SCHEDULER_PARAMS, ids=SCHEDULER_IDS) +def test_exponential_decay_scheduler(regression_model, optimizer, scheduler_args): + initial_lr, decay_rate, decay_steps, min_lr, staircase = scheduler_args + opt = optimizer(learning_rate=initial_lr) + scheduled_optimizer = exponential_decay_scheduler(opt, decay_steps, decay_rate, min_lr, staircase) + + with regression_model: + advi = pm.ADVI() + + updates = advi.objective.updates(obj_optimizer=scheduled_optimizer) + inputs = list(updates.keys()) + outputs = list(updates.values()) + + old_names = [x.name for x in inputs] + new_names = [x.name for x in outputs] + + assert all([expected_name in old_names for expected_name in ['learning_rate', 't']]) + assert all([expected_name in new_names for expected_name in ['learning_rate__updated', 't__updated']]) + + lr_idx = old_names.index('learning_rate') + t_idx = old_names.index('t') + + step_func = pytensor.function([], [outputs[lr_idx], outputs[t_idx]], + updates=updates, + mode='FAST_COMPILE') + + steps = np.vstack([step_func() for _ in range(10)]) + + def floor_div(x, y): + return x // y + + div_func = floor_div if staircase else np.divide + + expected_decay = np.maximum(initial_lr * decay_rate ** (div_func(np.arange(10), decay_steps)), min_lr) + + np.testing.assert_allclose(steps[:, 0], expected_decay) + np.testing.assert_allclose(steps[:, 1], np.arange(1, 11)) + + +def test_reduce_lr_on_plateau_scheduler(regression_model): + opt = pm.adam(learning_rate=1) + factor = 0.1 + patience = 10 + min_lr = 1e-6 + cooldown = 10 + scheduled_optimizer = reduce_lr_on_plateau_scheduler(opt, + factor=factor, patience=patience, + min_lr=min_lr, cooldown=cooldown) + with regression_model: + advi = pm.ADVI() + + updates = advi.objective.updates(obj_optimizer=scheduled_optimizer) + inputs = list(updates.keys()) + outputs = list(updates.values()) + + old_names = [x.name for x in inputs] + new_names = [x.name for x in outputs] + + expected_names = ['best_loss', 'cooldown_counter', 'wait', 'learning_rate'] + + assert all([expected_name in old_names for expected_name in expected_names]) + assert all([f'{expected_name}__updated' in new_names for expected_name in expected_names]) + + outputs_of_interest = [outputs[new_names.index(f'{expected_name}__updated')] for expected_name in + expected_names] + + tracker = pm.callbacks.Tracker(best_loss=outputs_of_interest[0].eval, + cooldown_counter=outputs_of_interest[1].eval, + wait=outputs_of_interest[2].eval, + learning_rate=outputs_of_interest[3].eval) + approx = advi.fit(1000, callbacks=[tracker], obj_optimizer=scheduled_optimizer) + + # Best loss only decreases + assert np.all(np.diff(np.stack(tracker.hist['best_loss'])) <= 0) + + # Learning_rate only decreases + assert np.all(np.diff(np.stack(tracker.hist['learning_rate'])) <= 0) + + # Wait is never greater than patience + assert np.all(np.stack(tracker.hist['wait']) <= patience) + + # Cooldown_counter is never greater than cooldown + assert np.all(np.stack(tracker.hist['cooldown_counter']) <= cooldown) From 56e77721ab9f98d241709191cda177cb49c59423 Mon 
Sep 17 00:00:00 2001 From: jessegrabowski Date: Sun, 3 Dec 2023 00:04:28 +0100 Subject: [PATCH 13/16] Implement schedulers as function wrappers --- tests/variational/test_callbacks.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/variational/test_callbacks.py b/tests/variational/test_callbacks.py index 799102ef5..908fd6e9d 100644 --- a/tests/variational/test_callbacks.py +++ b/tests/variational/test_callbacks.py @@ -54,19 +54,19 @@ def test_tracker_callback(): tracker(None, None, 1) -OPTIMIZERS = [pm.sgd, - pm.momentum, - pm.nesterov_momentum, - pm.adagrad, - pm.rmsprop, - pm.adadelta, - pm.adam, - pm.adamax] +OPTIMIZERS = [pm.sgd(learning_rate=0.1), + pm.momentum(learning_rate=0.1), + pm.nesterov_momentum(learning_rate=0.1), + pm.adagrad(learning_rate=0.1), + pm.rmsprop(learning_rate=0.1), + pm.adadelta(learning_rate=0.1), + pm.adam(learning_rate=0.1), + pm.adamax(learning_rate=0.1),] @pytest.mark.parametrize("optimizer", OPTIMIZERS) def test_reduce_lr_on_plateau(optimizer): cb = pm.variational.callbacks.ReduceLROnPlateau( - optimizer=optimizer(learning_rate=0.1), + optimizer=optimizer, patience=1, min_lr=0.001, ) From 03953cd9a8e3e876bc9197b26796c781be38ab25 Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Sun, 3 Dec 2023 00:05:57 +0100 Subject: [PATCH 14/16] Implement schedulers as function wrappers --- tests/variational/test_callbacks.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/tests/variational/test_callbacks.py b/tests/variational/test_callbacks.py index 908fd6e9d..0fb2af191 100644 --- a/tests/variational/test_callbacks.py +++ b/tests/variational/test_callbacks.py @@ -110,30 +110,3 @@ def test_exponential_decay(optimizer): np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) cb(None, [float("inf"), 2, 2, 2, 2], 4) np.testing.assert_almost_equal(optimizer.keywords["learning_rate"], 0.001) - - -def test_learning_rate_scheduler(): - X = np.random.normal(size=(100, 5)) - beta = np.random.normal(size=(5,)) - alpha = np.random.normal() - noise = np.random.normal(size=(100,)) - y = alpha + X @ beta + noise - - optimizer = pm.sgd() - scheduler = pm.callbacks.ExponentialDecay(optimizer=optimizer, decay_steps=10, decay_rate=0.5) - - with pm.Model() as mod: - b = pm.Normal('b', shape=(5,)) - a = pm.Normal('a') - sigma = pm.HalfNormal('sigma') - - mu = a + X @ b - y_hat = pm.Normal('y_hat', mu, sigma, observed=y) - advi = pm.ADVI(optimizer=optimizer) - - tracker = pm.callbacks.Tracker(mean=advi.approx.mean.eval) - approx = advi.fit(100, callbacks=[scheduler, tracker]) - - mean_history = tracker['mean'] - - From 79e13044d6cafd15a0ff6a2c8bdece4002ab1803 Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Sun, 3 Dec 2023 00:21:17 +0100 Subject: [PATCH 15/16] Run pre-commit hooks --- pymc/variational/updates.py | 269 ++++++++++++++++------------ tests/variational/test_callbacks.py | 19 +- tests/variational/test_updates.py | 124 ++++++++----- 3 files changed, 241 insertions(+), 171 deletions(-) diff --git a/pymc/variational/updates.py b/pymc/variational/updates.py index bd47046b2..8f24b948d 100644 --- a/pymc/variational/updates.py +++ b/pymc/variational/updates.py @@ -176,7 +176,7 @@ def get_or_compute_grads(loss_or_grads, params): else: grads = pytensor.grad(loss_or_grads, params) for grad, param in zip(grads, params): - grad.name = f'd_loss/d_{param.name}' + grad.name = f"d_loss/d_{param.name}" return grads @@ -219,8 +219,8 @@ def _partial_initialization_wrapper(optimizer): @wraps(optimizer) 
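    # Illustrative note (not part of the patch): with this wrapper, calling an optimizer
    # without `loss_or_grads`/`params` returns a functools.partial that only stores its
    # keyword arguments; the scheduler wrappers further down rely on exactly that, e.g.
    # (values assumed):
    #
    #     opt = pm.adam(learning_rate=0.1)                  # functools.partial(_adam, learning_rate=0.1)
    #     updates = opt(loss_or_grads=loss, params=params)  # OrderedDict of shared-variable updates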
def make_partial_if_necessary(*args, **kwargs): args = list(args) - loss_or_grads = _find_variable_among_args_kwargs(args, kwargs, 'loss_or_grads') - params = _find_variable_among_args_kwargs(args, kwargs, 'params') + loss_or_grads = _find_variable_among_args_kwargs(args, kwargs, "loss_or_grads") + params = _find_variable_among_args_kwargs(args, kwargs, "params") if loss_or_grads is None and params is None: return partial(optimizer, **kwargs) @@ -252,7 +252,9 @@ def _handle_loss_and_grad_input_wrapper(optimizer): """ @wraps(optimizer) - def discard_or_pass_through_loss_optimizer(loss_or_grads, params, discard_loss=True, *args, **kwargs): + def discard_or_pass_through_loss_optimizer( + loss_or_grads, params, discard_loss=True, *args, **kwargs + ): if isinstance(loss_or_grads, tuple): loss, grads = loss_or_grads updates = optimizer(loss_or_grads=grads, params=params, *args, **kwargs) @@ -313,7 +315,7 @@ def _sgd(loss_or_grads=None, params=None, *, learning_rate=1e-3): for param, grad in zip(params, grads): updated_param = param - learning_rate * grad - updated_param.name = f'{param.name}__updated' + updated_param.name = f"{param.name}__updated" updates[param] = updated_param return updates @@ -361,13 +363,15 @@ def apply_momentum(updates, params=None, momentum=0.9): for param in params: value = param.get_value(borrow=True) - velocity = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name='velocity') + velocity = pytensor.shared( + np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name="velocity" + ) updated_param = momentum * velocity + updates[param] - updated_param.name = f'{param}__updated' + updated_param.name = f"{param}__updated" updated_velocity = updated_param - param - updated_velocity.name = 'velocity__updated' + updated_velocity.name = "velocity__updated" updates[velocity] = updated_velocity updates[param] = updated_param @@ -483,10 +487,10 @@ def apply_nesterov_momentum(updates, params=None, momentum=0.9): velocity = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape) updated_velocity = momentum * velocity + updates[param] - param - updated_velocity.name = 'velocity__updated' + updated_velocity.name = "velocity__updated" updated_param = momentum * updated_velocity + updates[param] - updated_param.name = f'{param.name}__updated' + updated_param.name = f"{param.name}__updated" updates[velocity] = updated_velocity updates[param] = updated_param @@ -556,7 +560,9 @@ def _nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, mome return apply_nesterov_momentum(updates, momentum=momentum) -nesterov_momentum = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_nesterov_momentum)) +nesterov_momentum = _partial_initialization_wrapper( + _handle_loss_and_grad_input_wrapper(_nesterov_momentum) +) def _adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): @@ -624,15 +630,18 @@ def _adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): for param, grad in zip(params, grads): value = param.get_value(borrow=True) - accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, - name='gradient_squares') - accu_new = accu + grad ** 2 - accu_new.name = 'gradient_squares__updated' + accu = pytensor.shared( + np.zeros(value.shape, dtype=value.dtype), + shape=param.type.shape, + name="gradient_squares", + ) + accu_new = accu + grad**2 + accu_new.name = "gradient_squares__updated" updates[accu] = accu_new updated_param = 
param - (learning_rate * grad / pt.sqrt(accu_new + epsilon)) - updated_param.name = f'{param.name}__updated' + updated_param.name = f"{param.name}__updated" updates[param] = updated_param @@ -667,19 +676,20 @@ def _adagrad_window(loss_or_grads=None, params=None, learning_rate=0.001, epsilo grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() for param, grad in zip(params, grads): - i = pytensor.shared(pm.floatX(0), name='window_idx') + i = pytensor.shared(pm.floatX(0), name="window_idx") i_int = i.astype("int32") value = param.get_value(borrow=True) - accu = pytensor.shared(np.zeros(value.shape + (n_win,), dtype=value.dtype), - name='gradient_squares') + accu = pytensor.shared( + np.zeros(value.shape + (n_win,), dtype=value.dtype), name="gradient_squares" + ) # Append squared gradient vector to accu_new - accu_new = pt.set_subtensor(accu[..., i_int], grad ** 2) - accu_new.name = 'gradient_squares__updated' + accu_new = pt.set_subtensor(accu[..., i_int], grad**2) + accu_new.name = "gradient_squares__updated" i_new = pt.switch((i + 1) < n_win, i + 1, 0) - i_new.name = 'window_idx__updated' + i_new.name = "window_idx__updated" updates[accu] = accu_new updates[i] = i_new @@ -687,13 +697,15 @@ def _adagrad_window(loss_or_grads=None, params=None, learning_rate=0.001, epsilo accu_sum = accu_new.sum(axis=-1) param_updated = param - (learning_rate * grad / pt.sqrt(accu_sum + epsilon)) - param_updated.name = f'{param.name}__updated' + param_updated.name = f"{param.name}__updated" updates[param] = param_updated return updates -adagrad_window = _partial_initialization_wrapper(_handle_loss_and_grad_input_wrapper(_adagrad_window)) +adagrad_window = _partial_initialization_wrapper( + _handle_loss_and_grad_input_wrapper(_adagrad_window) +) def _rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon=1e-6): @@ -765,16 +777,19 @@ def _rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilo for param, grad in zip(params, grads): value = param.get_value(borrow=True) - accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, - name='gradient_squares') + accu = pytensor.shared( + np.zeros(value.shape, dtype=value.dtype), + shape=param.type.shape, + name="gradient_squares", + ) - accu_new = rho * accu + (one - rho) * grad ** 2 - accu_new.name = 'gradient_squares__updated' + accu_new = rho * accu + (one - rho) * grad**2 + accu_new.name = "gradient_squares__updated" updates[accu] = accu_new param_updated = param - (learning_rate * grad / pt.sqrt(accu_new + epsilon)) - param_updated.name = f'{param.name}__updated' + param_updated.name = f"{param.name}__updated" updates[param] = param_updated return updates @@ -863,15 +878,18 @@ def _adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsi value = param.get_value(borrow=True) # accu: accumulate gradient magnitudes - accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, - name='gradient_squares') + accu = pytensor.shared( + np.zeros(value.shape, dtype=value.dtype), + shape=param.type.shape, + name="gradient_squares", + ) # delta_accu: accumulate update magnitudes (recursively!) 
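        # Illustrative aside (not part of the patch): the point of the `<name>` /
        # `<name>__updated` naming applied throughout this module is that callers can
        # look a specific update up by name instead of by position, e.g. (sketch):
        #
        #     names = [var.name for var in updates.keys()]
        #     lr_new = list(updates.values())[names.index("learning_rate")]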
delta_accu = pytensor.shared( - np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name='' + np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name="" ) # update accu (as in rmsprop) - accu_new = rho * accu + (one - rho) * grad ** 2 + accu_new = rho * accu + (one - rho) * grad**2 updates[accu] = accu_new # compute parameter update, using the 'old' delta_accu @@ -879,7 +897,7 @@ def _adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsi updates[param] = param - learning_rate * update # update delta_accu (as accu, but accumulating updates) - delta_accu_new = rho * delta_accu + (one - rho) * update ** 2 + delta_accu_new = rho * delta_accu + (one - rho) * update**2 updates[delta_accu] = delta_accu_new return updates @@ -889,7 +907,7 @@ def _adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsi def _adam( - loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8 + loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8 ): """Adam updates @@ -945,36 +963,40 @@ def _adam( True """ all_grads = get_or_compute_grads(loss_or_grads, params) - t_prev = pytensor.shared(pm.pytensorf.floatX(0.0), name='t') + t_prev = pytensor.shared(pm.pytensorf.floatX(0.0), name="t") updates = OrderedDict() # Using pytensor constant to prevent upcasting of float32 one = pt.constant(1) t = t_prev + 1 - t.name = 't__updated' - a_t = learning_rate * pt.sqrt(one - beta2 ** t) / (one - beta1 ** t) - a_t.name = 'a' + t.name = "t__updated" + a_t = learning_rate * pt.sqrt(one - beta2**t) / (one - beta1**t) + a_t.name = "a" for param, g_t in zip(params, all_grads): name = param.name value = param.get_value(borrow=True) - m_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f'{name}_m') - v_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f'{name}_v') + m_prev = pytensor.shared( + np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f"{name}_m" + ) + v_prev = pytensor.shared( + np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f"{name}_v" + ) m_t = beta1 * m_prev + (one - beta1) * g_t - m_t.name = f'{name}_m__updated' - v_t = beta2 * v_prev + (one - beta2) * g_t ** 2 - v_t.name = f'{name}_v__updated' + m_t.name = f"{name}_m__updated" + v_t = beta2 * v_prev + (one - beta2) * g_t**2 + v_t.name = f"{name}_v__updated" step = a_t * m_t / (pt.sqrt(v_t) + epsilon) - step.name = f'{name}_step_size' + step.name = f"{name}_step_size" updates[m_prev] = m_t updates[v_prev] = v_t param_updated = param - step - param_updated.name = f'{name}__updated' + param_updated.name = f"{name}__updated" updates[param] = param_updated updates[t_prev] = t @@ -985,7 +1007,7 @@ def _adam( def _adamax( - loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8 + loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8 ): """Adamax updates @@ -1038,38 +1060,42 @@ def _adamax( True """ all_grads = get_or_compute_grads(loss_or_grads, params) - t_prev = pytensor.shared(pm.pytensorf.floatX(0.0), name='t') + t_prev = pytensor.shared(pm.pytensorf.floatX(0.0), name="t") updates = OrderedDict() # Using pytensor constant to prevent upcasting of float32 one = pt.constant(1) t = t_prev + 1 - t.name = 't__updated' + t.name = "t__updated" - a_t = learning_rate / (one - beta1 ** t) - a_t.name = 'a' + a_t = learning_rate / (one - beta1**t) + 
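    # Reference sketch (not part of the patch): per parameter, the Adam update built in
    # `_adam` above amounts to, in plain Python (lr, betas and eps assumed):
    #
    #     m = beta1 * m + (1 - beta1) * g
    #     v = beta2 * v + (1 - beta2) * g**2
    #     a_t = lr * sqrt(1 - beta2**t) / (1 - beta1**t)   # bias correction
    #     param = param - a_t * m / (sqrt(v) + eps)
    #
    # Adamax (this function) swaps v for the infinity-norm accumulator
    # u = max(beta2 * u, |g|) and drops the sqrt(1 - beta2**t) factor.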
a_t.name = "a" for param, g_t in zip(params, all_grads): name = param.name value = param.get_value(borrow=True) - m_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f'{name}_m') - u_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f'{name}_u') + m_prev = pytensor.shared( + np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f"{name}_m" + ) + u_prev = pytensor.shared( + np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape, name=f"{name}_u" + ) m_t = beta1 * m_prev + (one - beta1) * g_t - m_t.name = f'{name}_m__updated' + m_t.name = f"{name}_m__updated" u_t = pt.maximum(beta2 * u_prev, abs(g_t)) - u_t.name = f'{name}_u__updated' + u_t.name = f"{name}_u__updated" step = a_t * m_t / (u_t + epsilon) - step.name = f'{name}_step_size' + step.name = f"{name}_step_size" updates[m_prev] = m_t updates[u_prev] = u_t param_updated = param - step - param_updated.name = f'{name}__updated' + param_updated.name = f"{name}__updated" updates[param] = param_updated updates[t_prev] = t @@ -1212,7 +1238,7 @@ def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, return_norm=False learning with neural networks. In Advances in Neural Information Processing Systems (pp. 3104-3112). """ - norm = pt.sqrt(sum(pt.sum(tensor ** 2) for tensor in tensor_vars)) + norm = pt.sqrt(sum(pt.sum(tensor**2) for tensor in tensor_vars)) dtype = np.dtype(pytensor.config.floatX).type target_norm = pt.clip(norm, 0, dtype(max_norm)) multiplier = target_norm / (dtype(epsilon) + norm) @@ -1250,11 +1276,11 @@ def _handle_time_updates(updates): old_values = list(updates.keys()) old_names = [shared_var.name for shared_var in old_values] - t_idx = old_names.index('t') if 't' in old_names else None + t_idx = old_names.index("t") if "t" in old_names else None if t_idx is None: - t = pytensor.shared(pm.pytensorf.floatX(0.0), name='t') + t = pytensor.shared(pm.pytensorf.floatX(0.0), name="t") new_t = t + 1 - new_t.name = 't__updated' + new_t.name = "t__updated" else: # If t is already present, we will reuse it, but we also need to delete it from the update dict temporarily. # We always want it to be the last update applied. @@ -1265,8 +1291,13 @@ def _handle_time_updates(updates): return t, new_t -def exponential_decay_scheduler(optimizer: Callable, decay_steps: int, decay_rate: float, min_lr: float = 1e-6, - staircase: bool = False): +def exponential_decay_scheduler( + optimizer: Callable, + decay_steps: int, + decay_rate: float, + min_lr: float = 1e-6, + staircase: bool = False, +): """ Returns a new optimizer that applies exponential decay to the learning rate. @@ -1291,14 +1322,14 @@ def exponential_decay_scheduler(optimizer: Callable, decay_steps: int, decay_rat Optimizer with exponential decay applied to learning rate. 
""" if not 0 < decay_rate <= 1: - raise ValueError('decay_rate must be between 0 and 1') + raise ValueError("decay_rate must be between 0 and 1") kwargs = optimizer.keywords - _initial_lr = pm.floatX(optimizer.keywords['learning_rate']) + _initial_lr = pm.floatX(optimizer.keywords["learning_rate"]) - initial_lr = pt.constant(_initial_lr, name='initial_learning_rate') - shared_lr = _input_to_shared_variable(_initial_lr, 'learning_rate') - kwargs['learning_rate'] = shared_lr + initial_lr = pt.constant(_initial_lr, name="initial_learning_rate") + shared_lr = _input_to_shared_variable(_initial_lr, "learning_rate") + kwargs["learning_rate"] = shared_lr @wraps(optimizer) def optimizer_with_exponential_decay(loss_or_grads, params, *args, **kwargs): @@ -1312,7 +1343,7 @@ def optimizer_with_exponential_decay(loss_or_grads, params, *args, **kwargs): new_lr = pt.maximum(new_lr, min_lr) - new_lr.name = 'learning_rate__updated' + new_lr.name = "learning_rate__updated" updates[shared_lr] = new_lr updates[t] = new_t @@ -1323,17 +1354,17 @@ def optimizer_with_exponential_decay(loss_or_grads, params, *args, **kwargs): def reduce_lr_on_plateau_scheduler(optimizer, factor=0.1, patience=10, min_lr=1e-6, cooldown=0): kwargs = optimizer.keywords - _initial_lr = pm.floatX(optimizer.keywords['learning_rate']) - shared_lr = _input_to_shared_variable(_initial_lr, 'learning_rate') - kwargs['learning_rate'] = shared_lr + _initial_lr = pm.floatX(optimizer.keywords["learning_rate"]) + shared_lr = _input_to_shared_variable(_initial_lr, "learning_rate") + kwargs["learning_rate"] = shared_lr @wraps(optimizer) def optimizer_with_reduce_lr_on_plateau(loss_or_grads, params, *args, **kwargs): updates, loss = optimizer(loss_or_grads, params, *args, discard_loss=False, **kwargs) - cooldown_counter = pytensor.shared(np.zeros((), dtype='int32'), name='cooldown_counter') - wait = pytensor.shared(np.zeros((), dtype='int32'), name='wait') - best_loss = pytensor.shared(np.inf, name='best_loss') + cooldown_counter = pytensor.shared(np.zeros((), dtype="int32"), name="cooldown_counter") + wait = pytensor.shared(np.zeros((), dtype="int32"), name="wait") + best_loss = pytensor.shared(np.inf, name="best_loss") loss_is_inf = pt.isinf(loss) @@ -1341,47 +1372,53 @@ def optimizer_with_reduce_lr_on_plateau(loss_or_grads, params, *args, **kwargs): improving_loss = pt.lt(loss, best_loss) patience_exceeded = pt.ge(wait, patience) - updated_best_loss = pt.switch(loss_is_inf, - best_loss, - pt.switch(improving_loss, - loss, - best_loss)) - - updated_best_loss.name = 'best_loss__updated' - - updated_cooldown_counter = pt.switch(loss_is_inf, - cooldown_counter, - pt.switch(in_cooldown, - cooldown_counter - 1, - pt.switch(improving_loss, - cooldown_counter, - pt.switch(patience_exceeded, - cooldown, - cooldown_counter)))) - updated_cooldown_counter.name = 'cooldown_counter__updated' - - updated_lr = pt.switch(loss_is_inf, - shared_lr, - pt.switch(in_cooldown, - shared_lr, - pt.switch(improving_loss, - shared_lr, - pt.switch(patience_exceeded, - pt.maximum(min_lr, shared_lr * factor), - shared_lr)))) - - updated_lr.name = 'learning_rate__updated' - - updated_wait = pt.switch(loss_is_inf, - wait, - pt.switch(in_cooldown, - 0, - pt.switch(improving_loss, - 0, - pt.switch(patience_exceeded, - 0, - wait + 1)))) - updated_wait.name = 'wait__updated' + updated_best_loss = pt.switch( + loss_is_inf, best_loss, pt.switch(improving_loss, loss, best_loss) + ) + + updated_best_loss.name = "best_loss__updated" + + updated_cooldown_counter = pt.switch( + loss_is_inf, 
+ cooldown_counter, + pt.switch( + in_cooldown, + cooldown_counter - 1, + pt.switch( + improving_loss, + cooldown_counter, + pt.switch(patience_exceeded, cooldown, cooldown_counter), + ), + ), + ) + updated_cooldown_counter.name = "cooldown_counter__updated" + + updated_lr = pt.switch( + loss_is_inf, + shared_lr, + pt.switch( + in_cooldown, + shared_lr, + pt.switch( + improving_loss, + shared_lr, + pt.switch(patience_exceeded, pt.maximum(min_lr, shared_lr * factor), shared_lr), + ), + ), + ) + + updated_lr.name = "learning_rate__updated" + + updated_wait = pt.switch( + loss_is_inf, + wait, + pt.switch( + in_cooldown, + 0, + pt.switch(improving_loss, 0, pt.switch(patience_exceeded, 0, wait + 1)), + ), + ) + updated_wait.name = "wait__updated" updates[best_loss] = updated_best_loss updates[cooldown_counter] = updated_cooldown_counter diff --git a/tests/variational/test_callbacks.py b/tests/variational/test_callbacks.py index 0fb2af191..9649dc228 100644 --- a/tests/variational/test_callbacks.py +++ b/tests/variational/test_callbacks.py @@ -54,14 +54,17 @@ def test_tracker_callback(): tracker(None, None, 1) -OPTIMIZERS = [pm.sgd(learning_rate=0.1), - pm.momentum(learning_rate=0.1), - pm.nesterov_momentum(learning_rate=0.1), - pm.adagrad(learning_rate=0.1), - pm.rmsprop(learning_rate=0.1), - pm.adadelta(learning_rate=0.1), - pm.adam(learning_rate=0.1), - pm.adamax(learning_rate=0.1),] +OPTIMIZERS = [ + pm.sgd(learning_rate=0.1), + pm.momentum(learning_rate=0.1), + pm.nesterov_momentum(learning_rate=0.1), + pm.adagrad(learning_rate=0.1), + pm.rmsprop(learning_rate=0.1), + pm.adadelta(learning_rate=0.1), + pm.adam(learning_rate=0.1), + pm.adamax(learning_rate=0.1), +] + @pytest.mark.parametrize("optimizer", OPTIMIZERS) def test_reduce_lr_on_plateau(optimizer): diff --git a/tests/variational/test_updates.py b/tests/variational/test_updates.py index 9b91dc12f..f670a6e99 100644 --- a/tests/variational/test_updates.py +++ b/tests/variational/test_updates.py @@ -17,30 +17,44 @@ import pytest import pymc as pm + from pymc.variational.updates import ( adadelta, adagrad, adagrad_window, adam, adamax, + exponential_decay_scheduler, momentum, nesterov_momentum, + reduce_lr_on_plateau_scheduler, rmsprop, - sgd, exponential_decay_scheduler, reduce_lr_on_plateau_scheduler, + sgd, ) -OPTIMIZERS = [sgd, - momentum, - nesterov_momentum, - adagrad, - rmsprop, - adadelta, - adam, - adamax, - adagrad_window] - -OPTIMIZER_NAMES = ["sgd", "momentum", "nesterov_momentum", "adagrad", "rmsprop", - "adadelta", "adam", "adamax", "adagrad_window"] +OPTIMIZERS = [ + sgd, + momentum, + nesterov_momentum, + adagrad, + rmsprop, + adadelta, + adam, + adamax, + adagrad_window, +] + +OPTIMIZER_NAMES = [ + "sgd", + "momentum", + "nesterov_momentum", + "adagrad", + "rmsprop", + "adadelta", + "adam", + "adamax", + "adagrad_window", +] _a = pytensor.shared(1.0) _b = _a * 2 @@ -105,7 +119,9 @@ def regression_model(): y = intercept + coef * X + noise with pm.Model() as model: - a = pm.Normal("a", ) + a = pm.Normal( + "a", + ) b = pm.Normal("b") mu = a + b * X @@ -114,20 +130,26 @@ def regression_model(): return model -SCHEDULER_PARAMS = [(1, 0.5, 1, 1e-8, False), - (1, 0.5, 1, 1e-8, True), - (1, 0.5, 2, 1e-8, False), - (1, 0.5, 2, 1e-8, True)] -SCHEDULER_IDS = [f"initial_lr={x[0]}, decay_rate={x[1]}, decay_steps={x[2]}, min_lr={x[3]}, staircase={x[4]}" - for x in SCHEDULER_PARAMS] +SCHEDULER_PARAMS = [ + (1, 0.5, 1, 1e-8, False), + (1, 0.5, 1, 1e-8, True), + (1, 0.5, 2, 1e-8, False), + (1, 0.5, 2, 1e-8, True), +] +SCHEDULER_IDS = [ + 
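    # The grid above covers smooth vs. staircase decay for decay_steps of 1 and 2 at a
    # fixed decay_rate of 0.5; the ids below only make the pytest output self-describing.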
f"initial_lr={x[0]}, decay_rate={x[1]}, decay_steps={x[2]}, min_lr={x[3]}, staircase={x[4]}" + for x in SCHEDULER_PARAMS +] @pytest.mark.parametrize("optimizer", OPTIMIZERS, ids=OPTIMIZER_NAMES) -@pytest.mark.parametrize('scheduler_args', SCHEDULER_PARAMS, ids=SCHEDULER_IDS) +@pytest.mark.parametrize("scheduler_args", SCHEDULER_PARAMS, ids=SCHEDULER_IDS) def test_exponential_decay_scheduler(regression_model, optimizer, scheduler_args): initial_lr, decay_rate, decay_steps, min_lr, staircase = scheduler_args opt = optimizer(learning_rate=initial_lr) - scheduled_optimizer = exponential_decay_scheduler(opt, decay_steps, decay_rate, min_lr, staircase) + scheduled_optimizer = exponential_decay_scheduler( + opt, decay_steps, decay_rate, min_lr, staircase + ) with regression_model: advi = pm.ADVI() @@ -139,15 +161,17 @@ def test_exponential_decay_scheduler(regression_model, optimizer, scheduler_args old_names = [x.name for x in inputs] new_names = [x.name for x in outputs] - assert all([expected_name in old_names for expected_name in ['learning_rate', 't']]) - assert all([expected_name in new_names for expected_name in ['learning_rate__updated', 't__updated']]) + assert all([expected_name in old_names for expected_name in ["learning_rate", "t"]]) + assert all( + [expected_name in new_names for expected_name in ["learning_rate__updated", "t__updated"]] + ) - lr_idx = old_names.index('learning_rate') - t_idx = old_names.index('t') + lr_idx = old_names.index("learning_rate") + t_idx = old_names.index("t") - step_func = pytensor.function([], [outputs[lr_idx], outputs[t_idx]], - updates=updates, - mode='FAST_COMPILE') + step_func = pytensor.function( + [], [outputs[lr_idx], outputs[t_idx]], updates=updates, mode="FAST_COMPILE" + ) steps = np.vstack([step_func() for _ in range(10)]) @@ -156,7 +180,9 @@ def floor_div(x, y): div_func = floor_div if staircase else np.divide - expected_decay = np.maximum(initial_lr * decay_rate ** (div_func(np.arange(10), decay_steps)), min_lr) + expected_decay = np.maximum( + initial_lr * decay_rate ** (div_func(np.arange(10), decay_steps)), min_lr + ) np.testing.assert_allclose(steps[:, 0], expected_decay) np.testing.assert_allclose(steps[:, 1], np.arange(1, 11)) @@ -168,9 +194,9 @@ def test_reduce_lr_on_plateau_scheduler(regression_model): patience = 10 min_lr = 1e-6 cooldown = 10 - scheduled_optimizer = reduce_lr_on_plateau_scheduler(opt, - factor=factor, patience=patience, - min_lr=min_lr, cooldown=cooldown) + scheduled_optimizer = reduce_lr_on_plateau_scheduler( + opt, factor=factor, patience=patience, min_lr=min_lr, cooldown=cooldown + ) with regression_model: advi = pm.ADVI() @@ -181,28 +207,32 @@ def test_reduce_lr_on_plateau_scheduler(regression_model): old_names = [x.name for x in inputs] new_names = [x.name for x in outputs] - expected_names = ['best_loss', 'cooldown_counter', 'wait', 'learning_rate'] + expected_names = ["best_loss", "cooldown_counter", "wait", "learning_rate"] assert all([expected_name in old_names for expected_name in expected_names]) - assert all([f'{expected_name}__updated' in new_names for expected_name in expected_names]) - - outputs_of_interest = [outputs[new_names.index(f'{expected_name}__updated')] for expected_name in - expected_names] - - tracker = pm.callbacks.Tracker(best_loss=outputs_of_interest[0].eval, - cooldown_counter=outputs_of_interest[1].eval, - wait=outputs_of_interest[2].eval, - learning_rate=outputs_of_interest[3].eval) + assert all([f"{expected_name}__updated" in new_names for expected_name in expected_names]) + + 
outputs_of_interest = [ + outputs[new_names.index(f"{expected_name}__updated")] + for expected_name in expected_names + ] + + tracker = pm.callbacks.Tracker( + best_loss=outputs_of_interest[0].eval, + cooldown_counter=outputs_of_interest[1].eval, + wait=outputs_of_interest[2].eval, + learning_rate=outputs_of_interest[3].eval, + ) approx = advi.fit(1000, callbacks=[tracker], obj_optimizer=scheduled_optimizer) # Best loss only decreases - assert np.all(np.diff(np.stack(tracker.hist['best_loss'])) <= 0) + assert np.all(np.diff(np.stack(tracker.hist["best_loss"])) <= 0) # Learning_rate only decreases - assert np.all(np.diff(np.stack(tracker.hist['learning_rate'])) <= 0) + assert np.all(np.diff(np.stack(tracker.hist["learning_rate"])) <= 0) # Wait is never greater than patience - assert np.all(np.stack(tracker.hist['wait']) <= patience) + assert np.all(np.stack(tracker.hist["wait"]) <= patience) # Cooldown_counter is never greater than cooldown - assert np.all(np.stack(tracker.hist['cooldown_counter']) <= cooldown) + assert np.all(np.stack(tracker.hist["cooldown_counter"]) <= cooldown) From d375bf0427851b8d654b8fea64204bb67acee71a Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Sun, 3 Dec 2023 00:27:03 +0100 Subject: [PATCH 16/16] Change typehint from `Callable` to `partial` for mypy --- pymc/variational/updates.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pymc/variational/updates.py b/pymc/variational/updates.py index 8f24b948d..31bc5a96d 100644 --- a/pymc/variational/updates.py +++ b/pymc/variational/updates.py @@ -110,7 +110,6 @@ """ from collections import OrderedDict from functools import partial, wraps -from typing import Callable import numpy as np import pytensor @@ -1292,7 +1291,7 @@ def _handle_time_updates(updates): def exponential_decay_scheduler( - optimizer: Callable, + optimizer: partial, decay_steps: int, decay_rate: float, min_lr: float = 1e-6,
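As a closing sanity check on the plateau scheduler introduced in this series, the schedule can be replayed eagerly in plain Python; the sketch below is illustrative only (the function and argument names are assumed, it is not part of the patch) and mirrors the switch logic in `reduce_lr_on_plateau_scheduler`:

    import math

    def reduce_lr_on_plateau_reference(losses, lr, factor=0.1, patience=10, min_lr=1e-6, cooldown=0):
        """Replay the plateau schedule over a loss history; return the lr used at each step."""
        best_loss, wait, cooldown_counter = math.inf, 0, 0
        history = []
        for loss in losses:
            if not math.isinf(loss):
                improving = loss < best_loss
                best_loss = min(best_loss, loss)   # best loss updates even during cooldown
                if cooldown_counter > 0:
                    cooldown_counter -= 1
                    wait = 0
                elif improving:
                    wait = 0
                elif wait >= patience:
                    lr = max(min_lr, lr * factor)  # cut the rate, then start the cooldown
                    cooldown_counter = cooldown
                    wait = 0
                else:
                    wait += 1
            history.append(lr)
        return history

    # With patience=10, the 11th consecutive non-improving step triggers the first cut,
    # so the final entry reflects the reduced rate (0.1 * factor).
    print(reduce_lr_on_plateau_reference([math.inf, 1.0] + [2.0] * 11, lr=0.1))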