 from ngclearn.utils.model_utils import create_function
 
 def gaussian_logpdf(event, mean, stddev):
-    scale_sqrd = stddev ** 2
-    log_normalizer = jnp.log(2 * jnp.pi * scale_sqrd)
-    quadratic = (jax.lax.stop_gradient(event - 2 * mean) + mean) ** 2 / scale_sqrd
-    return -0.5 * (log_normalizer + quadratic)
+    # scale_sqrd = stddev ** 2
+    # log_normalizer = jnp.log(2 * jnp.pi * scale_sqrd)
+    # quadratic = (jax.lax.stop_gradient(event - 2 * mean) + mean)**2 / scale_sqrd
+    # return -0.5 * (log_normalizer + quadratic)
+    return -0.5 * jnp.log(2 * jnp.pi) - jnp.log(stddev) - 0.5 * ((jax.lax.stop_gradient(event - 2 * mean) + mean) / stddev) ** 2
 
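The new one-line return is algebraically the same Gaussian log-density as the commented-out version, but the stop_gradient(event - 2 * mean) + mean construction flips the sign of the gradient with respect to mean. A minimal standalone check of that sign flip against the plain log-pdf, using made-up scalar values (not part of the commit above):

# Standalone sanity check (hypothetical values; not part of the commit).
import jax
import jax.numpy as jnp

def straight_through_logpdf(event, mean, stddev):
    z = (jax.lax.stop_gradient(event - 2 * mean) + mean) / stddev
    return -0.5 * jnp.log(2 * jnp.pi) - jnp.log(stddev) - 0.5 * z ** 2

def plain_logpdf(event, mean, stddev):
    z = (event - mean) / stddev
    return -0.5 * jnp.log(2 * jnp.pi) - jnp.log(stddev) - 0.5 * z ** 2

g_st = jax.grad(straight_through_logpdf, argnums=1)(1.5, 0.2, 0.7)
g_plain = jax.grad(plain_logpdf, argnums=1)(1.5, 0.2, 0.7)
print(g_st, -g_plain)  # equal: the mean-gradient comes out negated
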
 class REINFORCESynapse(DenseSynapse):
 
     # Define Functions
     def __init__(
         self, name, shape, eta=1e-4, decay=0.99, weight_init=None, resist_scale=1., act_fx=None,
-        p_conn=1., w_bound=1., batch_size=1, **kwargs
+        p_conn=1., w_bound=1., batch_size=1, seed=None, **kwargs
     ):
         # This is because we have weights mu and weight log sigma
         input_dim, output_dim = shape
@@ -46,48 +47,61 @@ def __init__(
         self.decay = decay
         self.step_count = Compartment(jnp.zeros(()))
         self.learning_mask = Compartment(jnp.zeros(()))
+        # self.seed = Component(jnp.array(seed) if seed is not None else jnp.array(42, dtype=jnp.int32))
+        self.seed = Compartment(jax.random.PRNGKey(seed if seed is not None else 42))
 
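The integer seed argument is turned into a JAX PRNG key once at construction and stored in a compartment so it can be threaded through the update below. A standalone sketch of the same fallback pattern (hypothetical caller values, not ngc-learn API):

# Standalone sketch of the key construction (not part of the commit).
import jax
seed = None                                                  # caller passed no seed
key = jax.random.PRNGKey(seed if seed is not None else 42)   # falls back to the default 42
print(key)
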
     @staticmethod
-    def _compute_update(dt, inputs, rewards, act_fx, weights):
+    def _compute_update(dt, inputs, rewards, act_fx, weights, seed):
         W_mu, W_logstd = jnp.split(weights, 2, axis=-1)  # (input_dim, output_dim * 2) => (input_dim, output_dim), (input_dim, output_dim)
         # Forward pass
         activation = act_fx(inputs)
         mean = activation @ W_mu
         logstd = activation @ W_logstd
-        std = jnp.exp(logstd.clip(-10.0, 2.0))
+        clip_logstd = jnp.clip(logstd, -10.0, 2.0)
+        std = jnp.exp(clip_logstd)
         # Sample using reparameterization trick
-        epsilon = jnp.asarray(np.random.normal(0, 1, mean.shape))
+        epsilon = jax.random.normal(seed, mean.shape)
         sample = epsilon * std + mean
         outputs = sample  # the actual action that we take
         # Compute log probability density of the Gaussian
-        log_prob = gaussian_logpdf(sample, mean, std)
-        log_prob = log_prob.sum(-1)
+        log_prob = gaussian_logpdf(sample, mean, std).sum(-1)
         # Compute objective (negative REINFORCE objective)
         objective = (-log_prob * rewards).mean() * 1e-2
+
         # Backward pass
+        batch_size = inputs.shape[0]  # B
+        dL_dlogp = -rewards[:, None] * 1e-2 / batch_size  # (B, 1)
+
         # Compute gradients manually based on the derivation
         # dL/dmu = -(r - r_hat) * dlog_prob/dmu = -(r - r_hat) * -(sample - mu)/sigma^2
         # -(sample - mean) instead of (sample - mean) because we use a straight-through gradient in the log_prob function;
         # the stop_gradient-wrapped term there does not contribute to the gradient, only the outer "+ mean" path does
         dlog_prob_dmean = -(sample - mean) / (std ** 2)
+        dL_dmean = dL_dlogp * dlog_prob_dmean  # (B, A)
+        dL_dWmu = activation.T @ dL_dmean
+
         # dL/dlog(sigma) = -(r - r_hat) * dlog_prob/dlog(sigma) = -(r - r_hat) * (((sample - mu)/sigma)^2 - 1)
-        dlog_prob_dlogstd = ((sample - mean) / std) ** 2 - 1.0
-        # Compute gradients with respect to weights
-        # Using chain rule: dL/dW_mu = dL/dmu * dmu/dW_mu = dL/dmu * activation^T
-        # Similarly for W_logstd
-        # Gradient ascent instead of descent
-        dL_dWmu = activation.T @ (rewards[:, None] * dlog_prob_dmean) * 1e-2
-        dL_dWlstd = activation.T @ (rewards[:, None] * dlog_prob_dlogstd) * 1e-2
-        # Update weights
-        dW = jnp.concatenate([dL_dWmu, dL_dWlstd], axis=-1)
+        dlog_prob_dlogstd = (sample - mean) ** 2 / std ** 3 - 1.0 / std
+        dL_dstd = dL_dlogp * dlog_prob_dlogstd
+        # Apply gradient clipping for logstd
+        dL_dlogstd = jnp.where(
+            (logstd <= -10.0) | (logstd >= 2.0),
+            0.0,  # Zero gradient when clipped
+            dL_dstd * std
+        )
+        dL_dWlogstd = activation.T @ dL_dlogstd  # (I, B) @ (B, A) = (I, A)
+
+        # Update weights; negate the gradient because ngc-learn applies gradient ascent
+        dW = jnp.concatenate([-dL_dWmu, -dL_dWlogstd], axis=-1)
         # Finally, return metrics if needed
        return dW, objective, outputs
 
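The manual backward pass leans on the Gaussian score-function identities quoted in the comments, d log N/d mu = (x - mu)/sigma^2 and d log N/d log sigma = ((x - mu)/sigma)^2 - 1; the added code computes the derivative with respect to sigma and multiplies by sigma afterwards (dL_dstd * std), which is the same quantity. A standalone check of both identities against jax.grad of the plain log-pdf, with made-up scalar values (not part of the commit):

# Standalone check of the score-function identities (hypothetical values; not part of the commit).
import jax
import jax.numpy as jnp

def gauss_logpdf_ref(x, mean, logstd):
    std = jnp.exp(logstd)
    return -0.5 * jnp.log(2 * jnp.pi) - logstd - 0.5 * ((x - mean) / std) ** 2

x, mean, logstd = 0.8, -0.3, 0.5
std = jnp.exp(logstd)

d_mean, d_logstd = jax.grad(gauss_logpdf_ref, argnums=(1, 2))(x, mean, logstd)
print(d_mean, (x - mean) / std ** 2)             # matches (x - mu) / sigma^2
print(d_logstd, ((x - mean) / std) ** 2 - 1.0)   # matches ((x - mu) / sigma)^2 - 1
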
-    @transition(output_compartments=["weights", "dWeights", "objective", "outputs", "accumulated_gradients", "step_count"])
+    @transition(output_compartments=["weights", "dWeights", "objective", "outputs", "accumulated_gradients", "step_count", "seed"])
     @staticmethod
-    def evolve(dt, w_bound, inputs, rewards, act_fx, weights, eta, learning_mask, decay, accumulated_gradients, step_count):
+    def evolve(dt, w_bound, inputs, rewards, act_fx, weights, eta, learning_mask, decay, accumulated_gradients, step_count, seed):
+        main_seed, sub_seed = jax.random.split(seed)
         dWeights, objective, outputs = REINFORCESynapse._compute_update(
-            dt, inputs, rewards, act_fx, weights
+            dt, inputs, rewards, act_fx, weights, sub_seed
         )
         ## do a gradient ascent update/shift
         weights = (weights + dWeights * eta) * learning_mask + weights * (1.0 - learning_mask)  # update the weights only where learning_mask is 1.0
@@ -97,9 +111,9 @@ def evolve(dt, w_bound, inputs, rewards, act_fx, weights, eta, learning_mask, de
         step_count += 1
         accumulated_gradients = (step_count - 1) / step_count * accumulated_gradients * decay + 1.0 / step_count * dWeights  # EMA update of accumulated gradients
         step_count = step_count * (1 - learning_mask)  # reset the step count to 0 when we have learned
-        return weights, dWeights, objective, outputs, accumulated_gradients, step_count
+        return weights, dWeights, objective, outputs, accumulated_gradients, step_count, main_seed
 
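evolve now splits the stored key each step, hands one half to _compute_update for the jax.random.normal draw and returns the other half to the seed compartment, so consecutive calls draw fresh noise without the hidden np.random state of the old code; the accumulated_gradients line above meanwhile keeps a decayed running average of dWeights across those steps. A standalone sketch of the split-and-carry key pattern (made-up shapes, not ngc-learn API):

# Standalone sketch of the split-and-carry PRNG pattern (hypothetical shapes; not part of the commit).
import jax

key = jax.random.PRNGKey(42)
for step in range(3):
    key, sub = jax.random.split(key)       # carry `key`, consume `sub`
    eps = jax.random.normal(sub, (2, 4))   # fresh noise each step
    print(step, float(eps.sum()))
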
-    @transition(output_compartments=["inputs", "outputs", "objective", "rewards", "dWeights", "accumulated_gradients", "step_count"])
+    @transition(output_compartments=["inputs", "outputs", "objective", "rewards", "dWeights", "accumulated_gradients", "step_count", "seed"])
     @staticmethod
     def reset(batch_size, shape):
         preVals = jnp.zeros((batch_size, shape[0]))
@@ -111,7 +125,8 @@ def reset(batch_size, shape):
         dWeights = jnp.zeros(shape)
         accumulated_gradients = jnp.zeros((shape[0], shape[1] * 2))
         step_count = jnp.zeros(())
-        return inputs, outputs, objective, rewards, dWeights, accumulated_gradients, step_count
+        seed = jax.random.PRNGKey(42)
+        return inputs, outputs, objective, rewards, dWeights, accumulated_gradients, step_count, seed
 
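Note that reset re-initializes the seed compartment to a fixed jax.random.PRNGKey(42) rather than re-using the constructor's seed, so the post-reset noise sequence is identical across runs and instances. A standalone illustration (hypothetical, not part of the commit):

# Two keys built from the same constant produce identical draws.
import jax
k1 = jax.random.PRNGKey(42)
k2 = jax.random.PRNGKey(42)
print(bool((jax.random.normal(k1, (3,)) == jax.random.normal(k2, (3,))).all()))  # True
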
     @classmethod
     def help(cls):  ## component help function