4646 }
4747}
4848
49- /// Continuous actions are usually considered to be independent,
50- /// so we can sum components of the ``log_prob`` or the entropy.
51- ///
52- /// # Shapes
53- /// t: (batch, n_actions) or (batch)
54- /// return: (batch) for (batch, n_actions) input, or (1) for (batch) input
55- // fn sum_independent_dims<B: Backend>(t: Tensor<B, 1>) -> Tensor<B, 1>{
56- // t.sum()
57- // }
58-
59- // fn sum_independent_dims_batched<B: Backend>(t: Tensor<B, 1>) -> Tensor<B, 1>{
60- // t.sum_dim(1).squeeze(1)
61- // }
62-
6349#[ derive( Debug , Module ) ]
6450pub struct DiagGaussianDistribution < B : Backend > {
6551 means : Linear < B > ,
@@ -87,10 +73,11 @@ impl<B: Backend> DiagGaussianDistribution<B> {
8773
8874impl < B : Backend > ActionDistribution < B > for DiagGaussianDistribution < B > {
8975 fn log_prob ( & self , sample : Tensor < B , 2 > ) -> Tensor < B , 2 > {
// Per-component log-density of `sample` under the current diagonal Gaussian.
76+ // (B, N)
9077 let log_prob = self . dist . log_prob ( sample) ;
9178
92- // TODO: add sum_independent_dims when multi-dim actions are supported
93- log_prob
79+ // (B, 1)
// Action components are modelled as independent (diagonal covariance),
// so the joint log-prob is the sum over the action dimension — this is
// the `sum_independent_dims` behavior described in the file-top comment.
80+ log_prob. sum_dim ( 1 )
9481 }
9582
9683 fn entropy ( & self ) -> Tensor < B , 2 > {
@@ -107,13 +94,12 @@ impl<B: Backend> ActionDistribution<B> for DiagGaussianDistribution<B> {
10794
// Sample (or take the mean of) an action for `obs`, refreshing `self.dist`.
10895 fn actions_from_obs ( & mut self , obs : Tensor < B , 2 > , deterministic : bool ) -> Tensor < B , 2 > {
10996 let loc = self . means . forward ( obs. clone ( ) ) ;
// log-std is clamped to [-20, 2] before exp — presumably to bound the
// scale for numerical stability (common SAC convention); TODO confirm.
97+ let scale: Tensor < B , 2 > = self . log_std . forward ( obs) . clamp ( -20 , 2 ) . exp ( ) ;
// NOTE(review): the diff moves the `dist` refresh out of the stochastic
// branch, so after this call log_prob()/entropy() reflect THIS obs even
// on the deterministic path — previously only the else-branch updated it.
98+ self . dist = Normal :: new ( loc. clone ( ) , scale) ;
11099
111100 if deterministic {
112- loc
101+ self . dist . mean ( )
113102 } else {
114- let scale: Tensor < B , 2 > = self . log_std . forward ( obs) . clamp ( -20 , 2 ) . exp ( ) ;
115- self . dist = Normal :: new ( loc. clone ( ) , scale) ;
116-
// rsample: reparameterized draw, so gradients flow through loc/scale.
117103 self . dist . rsample ( )
118104 }
119105 }
@@ -141,22 +127,29 @@ impl<B: Backend> SquashedDiagGaussianDistribution<B> {
141127 }
142128
// Tanh-squash Jacobian correction (from the original SAC implementation,
// per the comment removed in this diff):
//   ln p(a) = ln p(u) - sum_i ln(1 - a_i^2 + eps),  a = tanh(u).
// `epsilon` guards the log against a_i^2 == 1.
143129 fn log_prob_correction ( & self , ln_u : Tensor < B , 2 > , a : Tensor < B , 2 > ) -> Tensor < B , 2 > {
144- ln_u - ( ( 1.0 - a. powi_scalar ( 2.0 ) + self . epsilon ) as Tensor < B , 2 > )
130+ // ln_u: (B, 1)
131+ // a: (B, N)
132+
133+ // (B, 1)
// NOTE(review): `( 1.0 - ... ) as Tensor < B , 2 >` is not valid Rust as
// written — `1.0 - tensor` needs a tensor op and `as` cannot cast here;
// likely a scrape artifact. Verify against the real source.
134+ let correction = ( ( 1.0 - a. powi_scalar ( 2.0 ) + self . epsilon ) as Tensor < B , 2 > )
145135 . log ( )
146- . sum_dim ( 1 )
// Summing over the action dim matches the independent-components
// convention used by the diagonal Gaussian's log_prob.
136+ . sum_dim ( 1 ) ;
137+
138+ // (B, 1)
139+ ln_u - correction
147140 }
148141}
149142
150143impl < B : Backend > ActionDistribution < B > for SquashedDiagGaussianDistribution < B > {
// Joint log-density of a tanh-squashed action `a`: invert the squashing,
// score the pre-squash sample under the diagonal Gaussian, then apply the
// change-of-variables correction for tanh.
151144 fn log_prob ( & self , a : Tensor < B , 2 > ) -> Tensor < B , 2 > {
145+ // (B, N)
// u = atanh(a): the unsquashed Gaussian sample corresponding to `a`.
152146 let u = tanh_bijector_inverse ( a. clone ( ) ) ;
153- let ln_u = self . diag_gaus_dist . log_prob ( u) ;
154147
155- // Squash correction (from original SAC implementation)
156- // this comes from the fact that tanh is bijective and differentiable
157- let ln_a = self . log_prob_correction ( ln_u, a) ;
148+ // (B, 1)
149+ let ln_u = self . diag_gaus_dist . log_prob ( u) ;
158150
159- ln_a
// Squash correction: tanh is bijective and differentiable, so
// ln p(a) = ln p(u) - ln |det J_tanh(u)| (see log_prob_correction).
151+ // (B, 1)
152+ self . log_prob_correction ( ln_u, a)
160153 }
161154
162155 fn entropy ( & self ) -> Tensor < B , 2 > {
0 commit comments