Commit a47ace9

removed instance comparisons in stochastic and exploration policies
1 parent c4a6d81 commit a47ace9

File tree

mighty/mighty_agents/sac.py
mighty/mighty_exploration/mighty_exploration_policy.py
mighty/mighty_exploration/stochastic_policy.py

3 files changed: +21 -23 lines

mighty/mighty_agents/sac.py

Lines changed: 7 additions & 5 deletions
@@ -145,7 +145,7 @@ def _initialize_agent(self) -> None:
 
         # Exploration policy wrapper
         self.policy = self.policy_class(
-            algo=self, model=self.model, **self.policy_kwargs
+            algo="sac", model=self.model, **self.policy_kwargs
         )
 
         # Updater
@@ -207,11 +207,13 @@ def process_transition(
         # Ensure metrics dict
         if metrics is None:
             metrics = {}
-
-        # Pack transition
+
+        # Pack transition
         terminated = metrics["transition"]["terminated"] # physics-failures
-        transition = TransitionBatch(curr_s, action, reward, next_s, terminated.astype(int))
-
+        transition = TransitionBatch(
+            curr_s, action, reward, next_s, terminated.astype(int)
+        )
+
         # Compute per-transition TD errors for logging
         td1, td2 = self.update_fn.calculate_td_error(transition)
         metrics["td_error1"] = td1.detach().cpu().numpy()

mighty/mighty_exploration/mighty_exploration_policy.py

Lines changed: 3 additions & 6 deletions
@@ -12,10 +12,7 @@
 
 
 def sample_nondeterministic_logprobs(
-    z: torch.Tensor,
-    mean: torch.Tensor,
-    log_std: torch.Tensor,
-    sac: bool = False
+    z: torch.Tensor, mean: torch.Tensor, log_std: torch.Tensor, sac: bool = False
 ) -> torch.Tensor:
     """
     Compute log-prob of a Gaussian sample z ~ N(mean, exp(log_std)),
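
The helper's body is not shown in this hunk. As a reference point only, the standard log-probability the docstring describes, for a tanh-squashed Gaussian with SAC's change-of-variables correction, can be computed as below; whether the sac flag gates the correction exactly this way is an assumption:

import math

import torch
import torch.nn.functional as F

def squashed_gaussian_log_prob(z, mean, log_std, sac=False):
    # Log-density of the pre-squash sample z under N(mean, exp(log_std)),
    # summed over the action dimensions.
    log_prob = torch.distributions.Normal(mean, log_std.exp()).log_prob(z).sum(-1)
    if sac:
        # Tanh correction for a = tanh(z): sum(log(1 - tanh(z)^2)) in its
        # numerically stable softplus form.
        log_prob -= (2.0 * (math.log(2.0) - z - F.softplus(-2.0 * z))).sum(-1)
    return log_prob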
@@ -115,9 +112,9 @@ def sample_func_logits(self, state_array):
 
         # ─── Continuous squashed-Gaussian (4-tuple) ──────────────────────────
         elif isinstance(out, tuple) and len(out) == 4:
-            action = out[0] # [batch, action_dim]
+            action = out[0]  # [batch, action_dim]
             log_prob = sample_nondeterministic_logprobs(
-                z=out[1], mean=out[2], log_std=out[3], sac= self.algo == "sac"
+                z=out[1], mean=out[2], log_std=out[3], sac=self.algo == "sac"
             )
             return action.detach().cpu().numpy(), log_prob
 

mighty/mighty_exploration/stochastic_policy.py

Lines changed: 11 additions & 12 deletions
@@ -27,13 +27,12 @@ def __init__(
         :param entropy_coefficient: weight on entropy term
         :param discrete: whether the action space is discrete
         """
-
+
         self.model = model
-
+
         super().__init__(algo, model, discrete)
         self.entropy_coefficient = entropy_coefficient
         self.discrete = discrete
-
 
         # --- override sample_action only for continuous SAC ---
         if not discrete and isinstance(model, SACModel):
@@ -88,9 +87,9 @@ def explore(self, s, return_logp, metrics=None) -> Tuple[np.ndarray, torch.Tensor]:
             # 4-tuple case (Tanh squashing): (action, z, mean, log_std)
             elif isinstance(model_output, tuple) and len(model_output) == 4:
                 action, z, mean, log_std = model_output
-
-                if not isinstance(self.model, SACModel):
-
+
+                if not self.algo == "sac":
+
                     log_prob = sample_nondeterministic_logprobs(
                         z=z,
                         mean=mean,
@@ -121,8 +120,8 @@ def explore(self, s, return_logp, metrics=None) -> Tuple[np.ndarray, torch.Tensor]:
                 elif len(model_output) == 4:
                     # Tanh squashing mode: (action, z, mean, log_std)
                     action, z, mean, log_std = model_output
-                    if not isinstance(self.model, SACModel):
-
+                    if not self.algo == "sac":
+
                         log_prob = sample_nondeterministic_logprobs(
                             z=z,
                             mean=mean,
@@ -147,7 +146,7 @@ def explore(self, s, return_logp, metrics=None) -> Tuple[np.ndarray, torch.Tensor]:
             if self.model.output_style == "squashed_gaussian":
                 # Should be 4-tuple: (action, z, mean, log_std)
                 action, z, mean, log_std = model_output
-                if not isinstance(self.model, SACModel):
+                if not self.algo == "sac":
                     log_prob = sample_nondeterministic_logprobs(
                         z=z,
                         mean=mean,
@@ -170,7 +169,7 @@ def explore(self, s, return_logp, metrics=None) -> Tuple[np.ndarray, torch.Tensor]:
             z = dist.rsample()
             action = torch.tanh(z)
 
-            if not isinstance(self.model, SACModel):
+            if not self.algo == "sac":
                 log_prob = sample_nondeterministic_logprobs(
                     z=z,
                     mean=mean,
@@ -179,7 +178,7 @@ def explore(self, s, return_logp, metrics=None) -> Tuple[np.ndarray, torch.Tensor]:
                 )
             else:
                 log_prob = self.model.policy_log_prob(z, mean, log_std)
-
+
             entropy = dist.entropy().sum(dim=-1, keepdim=True)
             weighted_log_prob = log_prob * entropy
             return action.detach().cpu().numpy(), weighted_log_prob
@@ -190,7 +189,7 @@ def explore(self, s, return_logp, metrics=None) -> Tuple[np.ndarray, torch.Tensor]:
             )
 
         # Special handling for SACModel
-        elif isinstance(self.model, SACModel):
+        elif self.algo == "sac" and isinstance(self.model, SACModel):
             action, z, mean, log_std = self.model(state, deterministic=False)
             # CRITICAL: Use the model's policy_log_prob which includes tanh correction
             log_prob = self.model.policy_log_prob(z, mean, log_std)
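
Taken together, every hunk in this file swaps the same instance comparison for a string comparison at the log-prob branch, and the final hunk keeps isinstance(self.model, SACModel) only as a guard next to the tag. Condensed, and illustrative rather than a verbatim excerpt:

if not self.algo == "sac":
    # Generic path: the shared helper computes the Gaussian log-prob.
    log_prob = sample_nondeterministic_logprobs(z=z, mean=mean, log_std=log_std)
else:
    # SAC path: the model's policy_log_prob already includes the tanh correction.
    log_prob = self.model.policy_log_prob(z, mean, log_std)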
