
Commit c4a6d81

updates for Merge
1 parent ade9d40 commit c4a6d81

8 files changed (+161, -86 lines changed)


mighty/mighty_agents/base_agent.py
Lines changed: 10 additions & 16 deletions

@@ -141,7 +141,6 @@ def __init__(  # noqa: PLR0915, PLR0912
         normalize_obs: bool = False,
         normalize_reward: bool = False,
         rescale_action: bool = False,
-        handle_timeout_termination: bool = False,
     ):
         """Base agent initialization.

@@ -302,8 +301,6 @@ def __init__(  # noqa: PLR0915, PLR0912
        for m in self.meta_modules.values():
            m.seed(self.seed)
        self.steps = 0
-
-       self.handle_timeout_termination = handle_timeout_termination

    def _initialize_agent(self) -> None:
        """Agent/algorithm specific initializations."""

@@ -606,24 +603,21 @@ def run(  # noqa: PLR0915
            metrics["episode_reward"] = episode_reward

            action, log_prob = self.step(curr_s, metrics)
-           # 1) step the env as usual
+           # step the env as usual
            next_s, reward, terminated, truncated, infos = self.env.step(action)

-           # 2) decide which samples are true “done”
+           # decide which samples are true “done”
            replay_dones = terminated  # physics‐failure only
-           dones = np.logical_or(terminated, truncated)
+           dones = np.logical_or(terminated, truncated)


-           # 3) optionally overwrite next_s on truncation
-           if self.handle_timeout_termination:
-               real_next_s = next_s.copy()
-               # infos["final_observation"] is a list/array of the last real obs
-               for i, tr in enumerate(truncated):
-                   if tr:
-                       real_next_s[i] = infos["final_observation"][i]
-           else:
-               real_next_s = next_s
-
+           # Overwrite next_s on truncation
+           # Based on https://github.com/DLR-RM/stable-baselines3/issues/284
+           real_next_s = next_s.copy()
+           # infos["final_observation"] is a list/array of the last real obs
+           for i, tr in enumerate(truncated):
+               if tr:
+                   real_next_s[i] = infos["final_observation"][i]
            episode_reward += reward

            # Log everything
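
With the handle_timeout_termination flag removed, run() now always recovers the true final observation on time-limit truncation, following the reasoning in stable-baselines3 issue #284: truncation should not be treated as a terminal state for value bootstrapping. A minimal standalone sketch of that pattern (not the Mighty API; the function name is illustrative), assuming a Gymnasium-style vector env that exposes the pre-reset observation under infos["final_observation"]:

```python
import numpy as np

def split_done_signals(next_s, terminated, truncated, infos):
    """Separate bootstrapping-relevant dones from episode-boundary dones."""
    # Only true terminations ("physics failures") should stop bootstrapping
    # in the replay buffer; time-limit truncations should not.
    replay_dones = terminated
    dones = np.logical_or(terminated, truncated)

    # On truncation the vector env has already auto-reset, so next_s holds the
    # first observation of the new episode; recover the real last observation.
    real_next_s = next_s.copy()
    for i, tr in enumerate(truncated):
        if tr:
            real_next_s[i] = infos["final_observation"][i]
    return real_next_s, replay_dones, dones
```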

mighty/mighty_agents/dqn.py
Lines changed: 0 additions & 2 deletions

@@ -69,7 +69,6 @@ def __init__(
         normalize_obs: bool = False,
         normalize_reward: bool = False,
         rescale_action: bool = False,  # type: ignore
-        handle_timeout_termination: bool = False,
     ):
         """DQN initialization.

@@ -155,7 +154,6 @@ def __init__(
             normalize_obs=normalize_obs,
             normalize_reward=normalize_reward,
             rescale_action=rescale_action,
-            handle_timeout_termination=handle_timeout_termination
         )

         self.loss_buffer = {

mighty/mighty_agents/ppo.py
Lines changed: 0 additions & 2 deletions

@@ -62,7 +62,6 @@ def __init__(
         normalize_reward: bool = False,
         rescale_action: bool = False,
         tanh_squash: bool = False,
-        handle_timeout_termination: bool = False,
     ):
         """Initialize the PPO agent.

@@ -144,7 +143,6 @@ def __init__(
             normalize_obs=normalize_obs,
             normalize_reward=normalize_reward,
             rescale_action=rescale_action,
-            handle_timeout_termination=handle_timeout_termination
         )

         self.loss_buffer = {

mighty/mighty_agents/sac.py
Lines changed: 1 addition & 4 deletions

@@ -57,7 +57,6 @@ def __init__(
         rescale_action: bool = False,  # ← NEW Whether to rescale actions to the environment's action space
         policy_frequency: int = 2,  # Frequency of policy updates
         target_network_frequency: int = 1,  # Frequency of target network updates
-        handle_timeout_termination: bool = True,
     ):
         """Initialize SAC agent with tunable hyperparameters and backward-compatible names."""
         if hidden_sizes is None:

@@ -117,7 +116,6 @@ def __init__(
             rescale_action=rescale_action,
             batch_size=batch_size,
             learning_rate=policy_lr,  # For compatibility with base class
-            handle_timeout_termination=handle_timeout_termination,
         )

         # Initialize loss buffer for logging

@@ -209,9 +207,8 @@ def process_transition(
         # Ensure metrics dict
         if metrics is None:
             metrics = {}
+
         # Pack transition
-        # `terminated` is used for physics failures in environments like `MightyEnv`
-        # Based on https://github.com/DLR-RM/stable-baselines3/issues/284
         terminated = metrics["transition"]["terminated"]  # physics‐failures
         transition = TransitionBatch(curr_s, action, reward, next_s, terminated.astype(int))
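
Because process_transition now packs only terminated into the TransitionBatch, time-limit truncations no longer cut off bootstrapping in the critic update. An illustrative sketch of the soft Q-target this enables (generic SAC math, not Mighty's actual update code; function and argument names are made up for the example):

```python
import torch

def sac_q_target(reward, terminated, next_q_min, next_log_prob,
                 gamma: float = 0.99, alpha: float = 0.2) -> torch.Tensor:
    """Soft Q-target whose bootstrap term is masked only by true terminations."""
    # (1 - terminated) stays 1 for time-limit truncations, so the value of the
    # truncated next state is still bootstrapped; only real environment
    # failures zero out the bootstrap term.
    not_done = 1.0 - terminated.float()
    soft_value = next_q_min - alpha * next_log_prob
    return reward + gamma * not_done * soft_value
```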

mighty/mighty_exploration/mighty_exploration_policy.py
Lines changed: 2 additions & 4 deletions

@@ -115,11 +115,9 @@ def sample_func_logits(self, state_array):

         # ─── Continuous squashed‐Gaussian (4‐tuple) ──────────────────────────
         elif isinstance(out, tuple) and len(out) == 4:
-            action = out[0]  # [batch, action_dim]
-
-            print(f'Self Model : {self.model}')
+            action = out[0]  # [batch, action_dim]
             log_prob = sample_nondeterministic_logprobs(
-                z=out[1], mean=out[2], log_std=out[3], sac=isinstance(self.model, SACModel)
+                z=out[1], mean=out[2], log_std=out[3], sac= self.ago == "sac"
             )
             return action.detach().cpu().numpy(), log_prob
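
The squashed-Gaussian branch delegates the log-probability to sample_nondeterministic_logprobs, passing the raw sample, mean, and log-std. That helper's internals are not part of this diff, so the following is only a generic sketch of the tanh change-of-variables correction such a function typically applies for SAC-style policies:

```python
import torch

def squashed_gaussian_log_prob(z, mean, log_std, eps: float = 1e-6):
    """Log-probability of a = tanh(z) where z ~ Normal(mean, exp(log_std))."""
    normal = torch.distributions.Normal(mean, log_std.exp())
    log_prob = normal.log_prob(z)
    # Change of variables for the tanh squashing: subtract log(1 - tanh(z)^2).
    log_prob -= torch.log(1.0 - torch.tanh(z).pow(2) + eps)
    return log_prob.sum(dim=-1, keepdim=True)
```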

mighty/mighty_exploration/stochastic_policy.py
Lines changed: 1 addition & 1 deletion

@@ -103,7 +103,7 @@ def explore(self, s, return_logp, metrics=None) -> Tuple[np.ndarray, torch.Tensor]:
         if return_logp:
             return action.detach().cpu().numpy(), log_prob
         else:
-            weighted_log_prob = log_prob
+            weighted_log_prob = log_prob * self.entropy_coefficient
             return action.detach().cpu().numpy(), weighted_log_prob

         # Check for model attribute-based approaches
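
explore() now scales the log-probability by the policy's entropy coefficient before returning it when return_logp is False. A minimal sketch of the assumed semantics (illustrative values, not Mighty code): the scaled quantity acts as a per-sample entropy bonus that the caller can use without knowing the coefficient itself.

```python
import torch

log_prob = torch.tensor([-1.2, -0.7, -2.3])           # log pi(a|s) for a small batch
entropy_coefficient = 0.01                             # hypothetical coefficient value
weighted_log_prob = log_prob * entropy_coefficient     # same scaling as explore()
entropy_bonus = -weighted_log_prob.mean()              # estimate of alpha * H(pi)
print(entropy_bonus)
```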

mighty/mighty_models/sac.py
Lines changed: 58 additions & 25 deletions

@@ -32,7 +32,7 @@ def __init__(
         # This model is continuous only
         self.continuous_action = True

-        # PR: register the per-dim scale and bias so we can rescale [-1,1]→[low,high].
+        # Register the per-dim scale and bias so we can rescale [-1,1]→[low,high].
         action_low = torch.as_tensor(action_low, dtype=torch.float32)
         action_high = torch.as_tensor(action_high, dtype=torch.float32)
         self.register_buffer(

@@ -67,42 +67,75 @@ def __init__(
         self.hidden_sizes = feature_extractor_kwargs.get("hidden_sizes", [256, 256])
         self.activation = feature_extractor_kwargs.get("activation", "relu")

-        # Shared feature extractor for policy
-        self.feature_extractor, out_dim = make_feature_extractor(
+        # Policy feature extractor and head
+        self.policy_feature_extractor, policy_feat_dim = make_feature_extractor(
             **feature_extractor_kwargs
         )
-
-        # Policy network outputs mean and log_std
-        # CHANGE: Create separate policy network (actor) similar to CleanRL
-        self.policy_net = make_policy_head(
-            in_size=self.obs_size,
+
+        # Policy head: just the final output layer
+        self.policy_head = make_policy_head(
+            in_size=policy_feat_dim,
             out_size=self.action_size * 2,  # mean and log_std
-            **head_kwargs
+            hidden_sizes=[],  # No hidden layers, just final linear layer
+            activation=head_kwargs["activation"]
         )

-        # Twin Q-networks
-        # — live Q-nets —
-        self.q_net1 = make_q_head(
-            in_size=self.obs_size + self.action_size, **head_kwargs
+        # Create policy_net for backward compatibility
+        self.policy_net = nn.Sequential(self.policy_feature_extractor, self.policy_head)
+
+        # Q-networks: feature extractors + heads
+        q_feature_extractor_kwargs = feature_extractor_kwargs.copy()
+        q_feature_extractor_kwargs["obs_shape"] = self.obs_size + self.action_size
+
+        # Q-network 1
+        self.q_feature_extractor1, q_feat_dim = make_feature_extractor(**q_feature_extractor_kwargs)
+        self.q_head1 = make_q_head(
+            in_size=q_feat_dim,
+            hidden_sizes=[],  # No hidden layers, just final linear layer
+            activation=head_kwargs["activation"]
         )
-        self.q_net2 = make_q_head(
-            in_size=self.obs_size + self.action_size, **head_kwargs
+        self.q_net1 = nn.Sequential(self.q_feature_extractor1, self.q_head1)
+
+        # Q-network 2
+        self.q_feature_extractor2, _ = make_feature_extractor(**q_feature_extractor_kwargs)
+        self.q_head2 = make_q_head(
+            in_size=q_feat_dim,
+            hidden_sizes=[],  # No hidden layers, just final linear layer
+            activation=head_kwargs["activation"]
         )
+        self.q_net2 = nn.Sequential(self.q_feature_extractor2, self.q_head2)

         # Target Q-networks
-        self.target_q_net1 = make_q_head(
-            in_size=self.obs_size + self.action_size, **head_kwargs
+        self.target_q_feature_extractor1, _ = make_feature_extractor(**q_feature_extractor_kwargs)
+        self.target_q_head1 = make_q_head(
+            in_size=q_feat_dim,
+            hidden_sizes=[],  # No hidden layers, just final linear layer
+            activation=head_kwargs["activation"]
         )
-        self.target_q_net1.load_state_dict(self.q_net1.state_dict())
-        self.target_q_net2 = make_q_head(
-            in_size=self.obs_size + self.action_size, **head_kwargs
+        self.target_q_net1 = nn.Sequential(self.target_q_feature_extractor1, self.target_q_head1)
+
+        self.target_q_feature_extractor2, _ = make_feature_extractor(**q_feature_extractor_kwargs)
+        self.target_q_head2 = make_q_head(
+            in_size=q_feat_dim,
+            hidden_sizes=[],  # No hidden layers, just final linear layer
+            activation=head_kwargs["activation"]
         )
-        self.target_q_net2.load_state_dict(self.q_net2.state_dict())
+        self.target_q_net2 = nn.Sequential(self.target_q_feature_extractor2, self.target_q_head2)
+
+        # Copy weights from live to target networks
+        self.target_q_feature_extractor1.load_state_dict(self.q_feature_extractor1.state_dict())
+        self.target_q_head1.load_state_dict(self.q_head1.state_dict())
+        self.target_q_feature_extractor2.load_state_dict(self.q_feature_extractor2.state_dict())
+        self.target_q_head2.load_state_dict(self.q_head2.state_dict())

         # Freeze target networks
-        for p in self.target_q_net1.parameters():
+        for p in self.target_q_feature_extractor1.parameters():
+            p.requires_grad = False
+        for p in self.target_q_head1.parameters():
+            p.requires_grad = False
+        for p in self.target_q_feature_extractor2.parameters():
             p.requires_grad = False
-        for p in self.target_q_net2.parameters():
+        for p in self.target_q_head2.parameters():
             p.requires_grad = False

         # Create a value function wrapper for compatibility

@@ -133,7 +166,7 @@ def forward(
         Forward pass for policy sampling.

         Returns:
-            action: torch.Tensor in [-1,1]
+            action: torch.Tensor in rescaled range [action_low, action_high]
             z: raw Gaussian sample before tanh
             mean: Gaussian mean
             log_std: Gaussian log std

@@ -155,7 +188,7 @@ def forward(
         # tanh→[-1,1]
         raw_action = torch.tanh(z)

-        # **HERE** we rescale into [low,high]
+        # Rescale into [action_low, action_high]
         action = raw_action * self.action_scale + self.action_bias

         return action, z, mean, log_std
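
The model now builds every network as an explicit feature extractor plus a linear head, wraps them in nn.Sequential for backward compatibility, and rescales tanh-squashed actions into the environment's range via registered buffers. A simplified standalone sketch of that actor structure, not the Mighty SACModel itself (layer sizes, the log-std clamp, and all names are illustrative):

```python
import torch
import torch.nn as nn

class TinySquashedActor(nn.Module):
    """Feature extractor + head emitting mean/log_std, with tanh rescaling."""

    def __init__(self, obs_size, action_size, action_low, action_high):
        super().__init__()
        self.extractor = nn.Sequential(
            nn.Linear(obs_size, 256), nn.ReLU(),
            nn.Linear(256, 256), nn.ReLU(),
        )
        self.head = nn.Linear(256, action_size * 2)  # mean and log_std
        low = torch.as_tensor(action_low, dtype=torch.float32)
        high = torch.as_tensor(action_high, dtype=torch.float32)
        # Buffers follow the module across devices and into state_dict,
        # but are never updated by the optimizer.
        self.register_buffer("action_scale", (high - low) / 2.0)
        self.register_buffer("action_bias", (high + low) / 2.0)

    def forward(self, obs):
        mean, log_std = self.head(self.extractor(obs)).chunk(2, dim=-1)
        log_std = log_std.clamp(-5.0, 2.0)
        z = mean + log_std.exp() * torch.randn_like(mean)  # reparameterized sample
        raw_action = torch.tanh(z)                          # squash to [-1, 1]
        action = raw_action * self.action_scale + self.action_bias
        return action, z, mean, log_std
```

Registering action_scale and action_bias as buffers rather than plain attributes is what lets the rescaling survive .to(device) calls and checkpointing without adding trainable parameters.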
