Skip to content

Commit 52e1d7f

Browse files
committed
Ported the main metalearning infrastructure from the hyper repo
1 parent 90f0015 commit 52e1d7f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+15949
-82
lines changed

ddopai/_modidx.py

Lines changed: 579 additions & 27 deletions
Large diffs are not rendered by default.

ddopai/agents/dynamic_pricing/inventory_constrained/IDP.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def __init__(self,
5858

5959
def draw_action(self, observation: np.ndarray):
6060
X = observation['features']
61-
B_t = observation['Inventory']
61+
B_t = observation['inventory']
6262
price = self.price_function(X, self.alpha, self.beta)
6363
lagrangian = self.lagrangian(B_t)
6464
price = price + lagrangian
@@ -74,6 +74,7 @@ def lagrangian(self, B_t):
7474
avg_remaining_B = (2 * B_t) / (self.T - self.t +1)
7575
lagrangian = (avg_remaining_B - np.dot(self.alpha, self.E_X)) / np.dot(self.beta, self.E_X)
7676
return lagrangian
77+
7778
def update_task(self, env):
7879
self.environment_info = env.mdp_info
7980
self.task = env.get_task()
@@ -85,6 +86,7 @@ def update_task(self, env):
8586
else:
8687
self.E_X = np.full(self.environment_info.observation_space['features'].shape[0], 1 / (2 * np.sqrt(self.environment_info.observation_space['features'].shape[0])))
8788
self.T = self.task["horizon"]
89+
self.t = 0
8890

8991
"""TODO add change in price function"""
9092
def fit(self, X, Y, action):

ddopai/agents/ml_utils.py

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,18 @@
33
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/30_agents/40_ml_utils.ipynb.
44

55
# %% auto 0
6-
__all__ = ['LRSchedulerPerStep']
6+
__all__ = ['LRSchedulerPerStep', 'init_gru', 'init_module', 'init_mlp']
77

88
# %% ../../nbs/30_agents/40_ml_utils.ipynb 3
99
from typing import List, Tuple, Literal
1010
import torch
1111

12+
from typing import Callable, List
13+
14+
import numpy as np
15+
from torch import nn as nn
16+
from torch.nn.utils import weight_norm
17+
1218
# %% ../../nbs/30_agents/40_ml_utils.ipynb 4
1319
class LRSchedulerPerStep():
1420
"""
@@ -39,3 +45,83 @@ def step(self):
3945

4046
for param_group in self.optimizer.param_groups:
4147
param_group['lr'] = lr
48+
49+
# %% ../../nbs/30_agents/40_ml_utils.ipynb 5
50+
def init_gru(input_size: int, recurrent_state_size: int) -> nn.Module:
    """
    Build a single-layer GRU with zeroed biases and orthogonal weight matrices.

    Args:
        input_size (int): Number of input features fed to the GRU.
        recurrent_state_size (int): Dimensionality of the GRU hidden state.

    Returns:
        nn.Module: The freshly initialized ``nn.GRU`` module.
    """
    gru = nn.GRU(input_size, recurrent_state_size)

    # Orthogonal weights and zero biases — a common stabilizing init for RNNs.
    for name, param in gru.named_parameters():
        if "weight" in name:
            nn.init.orthogonal_(param)
        elif "bias" in name:
            nn.init.constant_(param, 0)

    return gru
70+
71+
def init_module(
    module: nn.Module, weight_init: Callable, bias_init: Callable, gain: float = 1.0
) -> nn.Module:
    """
    Initialize a module's weight and bias in place, then apply weight normalization.

    Args:
        module (nn.Module): Module to initialize; must expose a ``weight``
            parameter (e.g. ``nn.Linear``).
        weight_init (Callable): Function initializing weights, called as
            ``weight_init(tensor, gain=gain)``.
        bias_init (Callable): Function initializing biases, called as
            ``bias_init(tensor)``. Skipped when the module has no bias.
        gain (float): Gain amount forwarded to ``weight_init``.

    Returns:
        nn.Module: The same module, initialized and weight-normalized.
    """
    weight_init(module.weight.data, gain=gain)
    # Guard against modules constructed with bias=False (bias is None there).
    if getattr(module, "bias", None) is not None:
        bias_init(module.bias.data)
    # NOTE(review): weight_norm mutates the module in place; it is deprecated
    # in favor of torch.nn.utils.parametrizations.weight_norm — flagged, not
    # replaced, to keep checkpoint/state-dict compatibility.
    weight_norm(module)

    return module
91+
92+
93+
def init_mlp(input_size: int, hidden_sizes: List[int]) -> nn.Sequential:
    """
    Build an MLP of ``Linear`` + ``ReLU`` layers with orthogonal weights and
    zero biases.

    Args:
        input_size (int): Number of input features to the first layer.
        hidden_sizes (List[int]): Output sizes of the successive hidden layers.

    Returns:
        nn.Sequential: The initialized MLP (empty when ``hidden_sizes`` is empty).
    """

    def _init_orthogonal(m: nn.Module) -> nn.Module:
        # Orthogonal weights (gain sqrt(2) suits ReLU activations), zero bias.
        return init_module(
            m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0), np.sqrt(2)
        )

    feature_sizes = [input_size, *hidden_sizes]

    mlp_modules: List[nn.Module] = []
    for in_features, out_features in zip(feature_sizes[:-1], feature_sizes[1:]):
        # _init_orthogonal already zeroes the bias; no second pass is needed.
        mlp_modules.append(_init_orthogonal(nn.Linear(in_features, out_features)))
        mlp_modules.append(nn.ReLU())

    return nn.Sequential(*mlp_modules)

ddopai/agents/rl/RL2ppo.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
"""PPO based agent"""
22

3-
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../../nbs/30_agents/51_RL_agents/10_RL2PPO_agents.ipynb.
3+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../../nbs/30_agents/51_RL_agents/10_RL2_agents.ipynb.
44

55
# %% auto 0
66
__all__ = ['GaussianTorchPolicyRL2', 'RL2PPO', 'RL2PPOAgent']
77

8-
# %% ../../../nbs/30_agents/51_RL_agents/10_RL2PPO_agents.ipynb 4
8+
# %% ../../../nbs/30_agents/51_RL_agents/10_RL2_agents.ipynb 4
99
import logging
1010

1111
# set logging level to INFO
@@ -44,7 +44,7 @@
4444
from itertools import chain
4545
import time
4646

47-
# %% ../../../nbs/30_agents/51_RL_agents/10_RL2PPO_agents.ipynb 5
47+
# %% ../../../nbs/30_agents/51_RL_agents/10_RL2_agents.ipynb 6
4848
class GaussianTorchPolicyRL2(TorchPolicy):
4949
"""
5050
Torch policy implementing a Gaussian policy with trainable standard
@@ -176,7 +176,7 @@ def parameters(self):
176176
return chain(self._mu.model.network.parameters(), [self._log_sigma])
177177

178178

179-
# %% ../../../nbs/30_agents/51_RL_agents/10_RL2PPO_agents.ipynb 6
179+
# %% ../../../nbs/30_agents/51_RL_agents/10_RL2_agents.ipynb 7
180180
class RL2PPO(Agent):
181181
"""
182182
Proximal Policy Optimization (PPO) Agent supporting sequential data and RL² compatibility.
@@ -479,7 +479,7 @@ def get_tensor(field, dtype=None):
479479
self.standardize_advantages(meta_episodes)
480480
# Convert stored fields to torch tensors.
481481
mb_obs = get_tensor("obs") # shape: (B, T, obs_dim)
482-
mb_acs = get_tensor("acs", "long") # shape: (B, T, action_dim)
482+
mb_acs = get_tensor("acs") # shape: (B, T, action_dim)
483483
mb_rews = get_tensor("rews") # shape: (B, T)
484484
mb_dones = get_tensor("dones") # shape: (B, T)
485485
mb_logpacs = get_tensor("logpacs") # shape: (B, T) or (B, T, 1)
@@ -541,7 +541,7 @@ def _post_load(self):
541541
update_optimizer_parameters(self._optimizer, list(self.policy.parameters()))
542542

543543

544-
# %% ../../../nbs/30_agents/51_RL_agents/10_RL2PPO_agents.ipynb 7
544+
# %% ../../../nbs/30_agents/51_RL_agents/10_RL2_agents.ipynb 8
545545
class RL2PPOAgent(MushroomBaseAgent):
546546
"""
547547
RL² PPO Agent for meta-learning, based on recurrent policy/value networks and MushroomRL core agent.
@@ -682,13 +682,7 @@ def reset_hidden(self, batch_size=1, device='cpu'):
682682
device (str): device where hidden states should be allocated ('cpu' or 'cuda').
683683
"""
684684
self.agent.reset_hidden_state(batch_size=batch_size, device=device)
685-
# Reset actor hidden state
686-
#actor_network = self.agent.policy._mu._impl.model.network
687-
#actor_network.hidden_state = actor_network.model.rnn.model.init_hidden(batch_size=batch_size, device=device)
688-
689-
# Reset critic hidden state
690-
#critic_network = self.agent._V._impl.model.network
691-
#critic_network.hidden_state = critic_network.model.rnn.model.init_hidden(batch_size=batch_size, device=device)
685+
692686

693687
def predict_(self, observation: np.ndarray) -> np.ndarray:
694688
"""

ddopai/envs/pricing/dynamic.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,10 @@ def step_(self,
132132
truncated = self.set_index()
133133

134134
info = dict(
135-
inv=self.inv * self.relative_inv,
135+
inv=(self.inv * self.relative_inv)[0],
136136
demand=demand,
137137
true_demand=true_demand,
138-
action=action.copy(),
138+
action=action[0],
139139
reward=reward,
140140
true_reward=true_reward,
141141
alpha=alpha,
@@ -163,11 +163,11 @@ def get_observation(self):
163163
Function to get the observation from the dataloader.
164164
"""
165165
x, reward_functions = self.dataloader[self.index]
166-
current_inv = np.array([self.relative_inv], dtype=np.float32)
166+
current_inv = self.relative_inv
167167

168168
observation = {
169169
"features": x,
170-
"inventory": current_inv
170+
"inventory": current_inv * self.inv
171171
}
172172
return observation, reward_functions
173173

@@ -191,7 +191,7 @@ def update_episode_params(self):
191191
"""
192192
Update the parameters of the episode.
193193
"""
194-
inv = np.array(self.task["inv_level"])
194+
inv = np.array([self.task["inv_level"]])
195195
relative_inv = np.ones_like(inv, dtype=np.float32)
196196
if hasattr(self, "inv"):
197197
self.set_param("inv", inv, inv.shape, new=False)

ddopai/experiments/experiment_functions_meta.py

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/40_experiments/10_experiment_functions_meta.ipynb.
44

55
# %% auto 0
6-
__all__ = ['EarlyStoppingHandler', 'calculate_score', 'log_info', 'log_info_history', 'update_best', 'save_agent', 'test_agent',
7-
'run_test_episode', 'run_experiment']
6+
__all__ = ['EarlyStoppingHandler', 'calculate_score', 'log_info', 'log_info_history', 'log_figure_from_history', 'update_best',
7+
'save_agent', 'test_agent', 'run_test_episode', 'run_experiment']
88

99
# %% ../../nbs/40_experiments/10_experiment_functions_meta.ipynb 3
1010
from abc import ABC, abstractmethod
@@ -14,7 +14,8 @@
1414
import numpy as np
1515
import sys
1616
import wandb
17-
17+
import matplotlib.pyplot as plt
18+
import seaborn as sns
1819
from ..envs.base import BaseEnvironment
1920
from ..agents.base import BaseAgent
2021

@@ -124,13 +125,55 @@ def log_info(R: float,
124125
def log_info_history(info: list,
125126
episode: int,
126127
tracking: Literal["wandb"], # only wandb implemented so far
127-
mode: Literal["train", "val", "test"]
128-
):
128+
mode: Literal["train", "val", "test"],
129+
commit: bool = False):
129130
if tracking == "wandb":
130-
table = wandb.Table(columns=["t", "action", "reward", "true_reward", "alpha", "beta", "episode"])
131+
table = wandb.Table(columns=["t", "action", "reward", "true_reward", "alpha", "beta", "inventory", "episode"])
131132
for t, row in enumerate(info):
132-
table.add_data(t, row["action"], row["reward"], row["true_reward"], row["alpha"], row["beta"], episode)
133-
wandb.log({f"{mode}/info_table": table})
133+
table.add_data(t, row["action"], row["reward"], row["true_reward"], row["alpha"], row["beta"], row["inv"], episode)
134+
wandb.log({f"{mode}/info_table": table}, commit=commit)
135+
136+
def log_figure_from_history(info: list,
                            episode: int,
                            tracking: Literal["wandb"], # only wandb implemented so far
                            mode: Literal["train", "val", "test"],
                            commit: bool = True
                            ):
    """
    Log line plots of reward, action, and inventory over one episode to wandb.

    Args:
        info (list): Per-step dicts with keys "reward", "true_reward",
            "action", and "inv".
        episode (int): Episode number (currently unused; kept for a signature
            parallel to log_info_history).
        tracking (Literal["wandb"]): Tracking backend; only "wandb" is implemented.
        mode (Literal["train", "val", "test"]): Prefix for the wandb log keys.
        commit (bool): Whether the final wandb.log call commits the step.
    """
    if tracking != "wandb":
        return

    steps = list(range(len(info)))

    def _log_lineplot(series: dict, title: str, ylabel: str, key: str, commit_now: bool):
        # Render one seaborn line figure and push it to wandb as an image.
        plt.figure(figsize=(10, 6))
        for label, values in series.items():
            sns.lineplot(x=steps, y=values, label=label)
        plt.title(title)
        plt.xlabel("T")
        plt.ylabel(ylabel)
        plt.legend()
        fig = plt.gcf()
        wandb.log({f"{mode}/{key}": wandb.Image(fig)}, commit=commit_now)
        plt.close(fig)

    _log_lineplot(
        {"Reward": [row["reward"] for row in info],
         "True Reward": [row["true_reward"] for row in info]},
        "Reward and True Reward over time", "Reward",
        "reward_over_time_image", False,
    )
    _log_lineplot(
        {"Action": [row["action"] for row in info]},
        "Action over time", "Action",
        "action_over_time_image", False,
    )
    # Only the last log may commit so all three images land on the same step.
    _log_lineplot(
        {"Inventory": [row["inv"] for row in info]},
        "Inventory over time", "Inventory",
        "inventory_over_time_image", commit,
    )
176+
134177

135178
def update_best(R: float, J: float, best_R: float, best_J: float): #
136179

@@ -209,7 +252,8 @@ def test_agent(agent: BaseAgent,
209252
if tracking == "wandb":
210253
mode = env.mode
211254
wandb.log({f"{mode}/Episode":episode,f"{mode}/R": R, f"{mode}/J": J}, commit=False)
212-
log_info_history([ep_d[1] for ep_d in episode_dataset], episode, tracking, mode)
255+
log_info_history([ep_d[1] for ep_d in episode_dataset], episode, tracking, mode, commit=False)
256+
log_figure_from_history([ep_d[1] for ep_d in episode_dataset], episode, tracking, mode, commit=True)
213257
if return_dataset:
214258
return np.mean(list_R), np.mean(list_J), dataset
215259
else:
@@ -362,7 +406,8 @@ def run_experiment( agent: BaseAgent,
362406
R_list.append(R)
363407
J_list.append(J)
364408
wandb.log({f"test/R": R, f"test/J": J}, commit=False)
365-
log_info_history(env.get_info(), episode, tracking, "test")
409+
log_info_history(env.get_info(), episode, tracking, "test", commit=False)
410+
log_figure_from_history(env.get_info(), episode, tracking, "test", commit=True)
366411
if ((episode+1) % print_freq) == 0:
367412
logging.info(f"Episode {episode+1}: R={R}, J={J}")
368413

ddopai/meta_learning/__init__.py

Whitespace-only changes.

ddopai/meta_learning/algorithms/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)