 from ngclearn.utils import tensorstats
 from ngclearn.utils.model_utils import create_function

+def gaussian_logpdf(event, mean, stddev):
+    scale_sqrd = stddev ** 2
+    log_normalizer = jnp.log(2 * jnp.pi * scale_sqrd)
+    quadratic = (jax.lax.stop_gradient(event - 2 * mean) + mean) ** 2 / scale_sqrd
+    return -0.5 * (log_normalizer + quadratic)

 class REINFORCESynapse(DenseSynapse):

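As a standalone illustration (outside the patch itself): the stop_gradient wrapping in the new gaussian_logpdf helper leaves the log-density value identical to the ordinary Gaussian log-pdf that the removed line further down computes, but it flips the sign of the gradient with respect to the mean, which is what the revised dlog_prob_dmean line below relies on. A minimal sketch, assuming scalar inputs, that checks this with jax.grad:

import jax
import jax.numpy as jnp

def gaussian_logpdf(event, mean, stddev):
    # helper as added above: stop_gradient(event - 2*mean) + mean evaluates to event - mean,
    # but only the trailing "+ mean" term carries gradient with respect to mean
    scale_sqrd = stddev ** 2
    log_normalizer = jnp.log(2 * jnp.pi * scale_sqrd)
    quadratic = (jax.lax.stop_gradient(event - 2 * mean) + mean) ** 2 / scale_sqrd
    return -0.5 * (log_normalizer + quadratic)

def standard_logpdf(event, mean, stddev):
    # ordinary Gaussian log-density (the expression being replaced in _compute_update)
    return -0.5 * jnp.log(2 * jnp.pi) - jnp.log(stddev) - 0.5 * ((event - mean) / stddev) ** 2

x, mu, sigma = 1.5, 0.5, 2.0
print(gaussian_logpdf(x, mu, sigma), standard_logpdf(x, mu, sigma))  # same value, approx -1.737
print(jax.grad(gaussian_logpdf, argnums=1)(x, mu, sigma))  # -(x - mu) / sigma**2 = -0.25
print(jax.grad(standard_logpdf, argnums=1)(x, mu, sigma))  #  (x - mu) / sigma**2 =  0.25
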
@@ -39,6 +44,8 @@ def __init__(
         # self.seed = Component(seed)
         self.accumulated_gradients = Compartment(jnp.zeros((input_dim, output_dim * 2)))
         self.decay = decay
+        self.step_count = Compartment(jnp.zeros(()))
+        self.learning_mask = Compartment(jnp.zeros(()))

     @staticmethod
     def _compute_update(dt, inputs, rewards, act_fx, weights):
@@ -53,41 +60,44 @@ def _compute_update(dt, inputs, rewards, act_fx, weights):
         sample = epsilon * std + mean
         outputs = sample  # the actual action that we take
         # Compute log probability density of the Gaussian
-        log_prob = -0.5 * jnp.log(2 * jnp.pi) - logstd - 0.5 * ((sample - mean) / std) ** 2
+        log_prob = gaussian_logpdf(sample, mean, std)
         log_prob = log_prob.sum(-1)
         # Compute objective (negative REINFORCE objective)
         objective = (-log_prob * rewards).mean() * 1e-2
         # Backward pass
         # Compute gradients manually based on the derivation
-        # dL/dmu = -(r-r_hat) * dlog_prob/dmu = -(r-r_hat) * (sample-mu)/sigma^2
-        dlog_prob_dmean = (sample - mean) / (std ** 2)
+        # dL/dmu = -(r-r_hat) * dlog_prob/dmu = -(r-r_hat) * -(sample-mu)/sigma^2
+        dlog_prob_dmean = -(sample - mean) / (std ** 2)
         # dL/dlog(sigma) = -(r-r_hat) * dlog_prob/dlog(sigma) = -(r-r_hat) * (((sample-mu)/sigma)^2 - 1)
         dlog_prob_dlogstd = ((sample - mean) / std) ** 2 - 1.0
         # Compute gradients with respect to weights
         # Using chain rule: dL/dW_mu = dL/dmu * dmu/dW_mu = dL/dmu * activation^T
         # Similarly for W_logstd
-        dL_dWmu = activation.T @ (-rewards[:, None] * dlog_prob_dmean) * 1e-2
-        dL_dWlstd = activation.T @ (-rewards[:, None] * dlog_prob_dlogstd) * 1e-2
+        # Gradient ascent instead of descent
+        dL_dWmu = activation.T @ (rewards[:, None] * dlog_prob_dmean) * 1e-2
+        dL_dWlstd = activation.T @ (rewards[:, None] * dlog_prob_dlogstd) * 1e-2
         # Update weights
         dW = jnp.concatenate([dL_dWmu, dL_dWlstd], axis=-1)
         # Finally, return metrics if needed
         return dW, objective, outputs

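As a standalone check (outside the patch itself): the dL_dWmu and dL_dWlstd lines above apply the usual linear-layer chain rule, dL/dW = activation^T @ dL/d(output). A short sketch with made-up shapes (batch 4, input_dim 3, output_dim 2), where g_out stands in for a per-sample term such as rewards[:, None] * dlog_prob_dmean:

import jax
import jax.numpy as jnp

activation = jax.random.normal(jax.random.PRNGKey(0), (4, 3))   # batch of hidden activities
W = jax.random.normal(jax.random.PRNGKey(1), (3, 2))            # weights mapping 3 -> 2
g_out = jax.random.normal(jax.random.PRNGKey(2), (4, 2))        # pretend dL/d(output) per sample

# scalar surrogate loss whose gradient with respect to the layer output is exactly g_out
loss = lambda W_: jnp.sum((activation @ W_) * g_out)
auto_grad = jax.grad(loss)(W)
manual_grad = activation.T @ g_out                               # the form used in _compute_update
print(jnp.allclose(auto_grad, manual_grad))                      # True
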
-    @transition(output_compartments=["weights", "dWeights", "objective", "outputs", "accumulated_gradients"])
+    @transition(output_compartments=["weights", "dWeights", "objective", "outputs", "accumulated_gradients", "step_count"])
     @staticmethod
-    def evolve(dt, w_bound, inputs, rewards, act_fx, weights, eta, decay, accumulated_gradients):
+    def evolve(dt, w_bound, inputs, rewards, act_fx, weights, eta, learning_mask, decay, accumulated_gradients, step_count):
         dWeights, objective, outputs = REINFORCESynapse._compute_update(
             dt, inputs, rewards, act_fx, weights
         )
         ## do a gradient ascent update/shift
-        weights = weights + dWeights * eta
+        weights = (weights + dWeights * eta) * learning_mask + weights * (1.0 - learning_mask)  # update the weights only where learning_mask is 1.0
         ## enforce non-negativity
-        eps = 0.01  # 0.001
+        eps = 0.0  # 0.01 # 0.001
         weights = jnp.clip(weights, eps, w_bound - eps)  # jnp.abs(w_bound))
-        accumulated_gradients = accumulated_gradients * decay + dWeights
-        return weights, dWeights, objective, outputs, accumulated_gradients
+        step_count += 1
+        accumulated_gradients = (step_count - 1) / step_count * accumulated_gradients * decay + 1.0 / step_count * dWeights  # EMA update of accumulated gradients
+        step_count = step_count * (1 - learning_mask)  # reset the step count to 0 when we have learned
+        return weights, dWeights, objective, outputs, accumulated_gradients, step_count

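As a standalone sketch (outside the patch itself): the evolve update above keeps a decayed running average of dWeights in accumulated_gradients and zeroes step_count whenever learning_mask is 1.0, so the average restarts after each masked learning step. The loop and constant gradient values below are purely illustrative:

import jax.numpy as jnp

def accumulate(accumulated_gradients, step_count, dWeights, decay, learning_mask):
    # mirrors the bookkeeping in evolve: running (decayed) mean of the gradients,
    # with the counter reset once a learning step has been applied
    step_count = step_count + 1
    accumulated_gradients = ((step_count - 1) / step_count * accumulated_gradients * decay
                             + 1.0 / step_count * dWeights)
    step_count = step_count * (1 - learning_mask)
    return accumulated_gradients, step_count

acc, n = jnp.zeros((2, 4)), jnp.zeros(())
for t, mask in enumerate([0.0, 0.0, 1.0]):  # learn on the third step
    acc, n = accumulate(acc, n, jnp.ones((2, 4)) * (t + 1), decay=1.0, learning_mask=mask)
print(acc)  # with decay=1.0, the running mean of the three dWeights: 2.0 everywhere
print(n)    # 0.0, the counter was reset by learning_mask on the last step
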
-    @transition(output_compartments=["inputs", "outputs", "objective", "rewards", "dWeights", "accumulated_gradients"])
+    @transition(output_compartments=["inputs", "outputs", "objective", "rewards", "dWeights", "accumulated_gradients", "step_count"])
     @staticmethod
     def reset(batch_size, shape):
         preVals = jnp.zeros((batch_size, shape[0]))
@@ -98,7 +108,8 @@ def reset(batch_size, shape):
         rewards = jnp.zeros((batch_size,))
         dWeights = jnp.zeros(shape)
         accumulated_gradients = jnp.zeros((shape[0], shape[1] * 2))
-        return inputs, outputs, objective, rewards, dWeights, accumulated_gradients
+        step_count = jnp.zeros(())
+        return inputs, outputs, objective, rewards, dWeights, accumulated_gradients, step_count

     @classmethod
     def help(cls):  ## component help function