
Commit ed2d54f

Author: Josiah Laivins (committed)
Added:
* Memory RAM size reduction via cleaning on item input.

Fixed:
* DDPG is stable now. Works on Pendulum as expected/desired.

Notes:
* Now that DDPG works as expected, we will move to preparing the repo for version 1.0. This will involve testing/CI and passing expected benchmarks.
1 parent fea9a2e commit ed2d54f

File tree: 7 files changed (+89 / -51 lines)


README.md

Lines changed: 9 additions & 4 deletions
@@ -173,6 +173,11 @@ could give discrete agents the ability to operate in a continuous domain via bin
 - [X] 0.5.0 DDPG added. let us move
 - [X] 0.5.0 The DDPG paper contains a visualization for Q learning might prove useful. Add to interpreter.
 
+| ![](res/ddpg_balancing.gif) |
+|:----:|
+| *Fig 7: DDPG trains stably now..* |
+
+
 Added q value interpretation per explanation by Lillicrap et al., 2016. Currently both models (DQN and DDPG) have
 unstable q value approximations. Below is an example from DQN.
 ```python
@@ -184,22 +189,22 @@ a failing one will look globular or horizontal.
 
 | ![](res/dqn_q_estimate_1.jpg) |
 |:----:|
-| *Fig 7: Initial Q Value Estimate. Seems globular which is expected for an initial model.* |
+| *Fig 8: Initial Q Value Estimate. Seems globular which is expected for an initial model.* |
 
 | ![](res/dqn_q_estimate_2.jpg) |
 |:----:|
-| *Fig 8: Seems like the DQN is not learning...* |
+| *Fig 9: Seems like the DQN is not learning...* |
 
 | ![](res/dqn_q_estimate_3.jpg) |
 |:----:|
-| *Fig 9: Alarming later epoch results. It seems that the DQN converges to predicting a single Q value.* |
+| *Fig 10: Alarming later epoch results. It seems that the DQN converges to predicting a single Q value.* |
 
 - [X] 0.6.0 Single Global fit function like Fastai's. Think about the missing batch step. Noted some of the changes to
 the existing the Fastai
 
 | ![](res/fit_func_out.jpg) |
 |:----:|
-| *Fig 10: Resulting output of a typical fit function using ref code below.* |
+| *Fig 11: Resulting output of a typical fit function using ref code below.* |
 
 ```python
 from fast_rl.agents.DQN import DuelingDQN

fast_rl/agents/BaseAgent.py

Lines changed: 7 additions & 2 deletions
@@ -46,6 +46,7 @@ def pick_action(self, x):
         if len(x.shape) > 2: raise ValueError('The agent is outputting actions with more than 1 dimension...')
 
         action, x, perturbed = self.exploration_strategy.perturb(x, x, self.data.train_ds.env.action_space)
+        x = np.clip(x, -1.0, 1.0)
 
         if isinstance(self.data.train_ds.env.action_space, Discrete) and not perturbed: action = x.argmax().numpy().item()
         elif isinstance(self.data.train_ds.env.action_space, Box): action = x.squeeze(0).numpy()
@@ -72,7 +73,8 @@ def forward(self, x):
         return x.view(x.size(0), -1)
 
 
-def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use_embed=True, activation_fuction=None):
+def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use_embed=True,
+                    activation_function=None, final_activation_function=None):
     """Generates an nn module.
 
     Notes:
@@ -81,7 +83,7 @@ def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use
     Returns:
 
     """
-    act = nn.LeakyReLU if activation_fuction is None else activation_fuction
+    act = nn.LeakyReLU if activation_function is None else activation_function
    action_size = action_size[0] # For now the dimension of the action does not make a difference.
     # For now keep drop out as 0, test including dropout later
     ps = [0] * len(layer_list)
@@ -93,8 +95,11 @@ def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use
             embedded, n_in = get_embedded(n_in[0], n_out, n_in[1], 5)
             layers += [ToLong(), embedded, Flatten()]
         elif i == 0: n_in = n_in[0]
+        if i == 0 and use_bn: layers += [nn.BatchNorm1d(n_in)]
 
         layers += bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act)
+
+    if final_activation_function is not None: layers += [final_activation_function()]
     return nn.Sequential(*layers)
 
 
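Taken together, the two changes above let a policy network end in a bounding non-linearity (DDPG.py below passes `final_activation_function=nn.Tanh`) while `pick_action` clips whatever the exploration strategy returns. A minimal sketch of that pattern; the layer sizes and noise scale here are illustrative assumptions, not values from the repo:

```python
# Sketch: a bounded continuous-action head. Tanh keeps the raw policy output in (-1, 1);
# exploration noise can still push it outside, hence the clip, mirroring the
# `np.clip(x, -1.0, 1.0)` added to pick_action. Sizes and noise scale are assumptions.
import numpy as np
import torch
from torch import nn

state_size, action_size = 3, 1   # Pendulum-v0-like dimensions (assumed)
actor = nn.Sequential(
    nn.Linear(state_size, 400), nn.LeakyReLU(),
    nn.Linear(400, 300), nn.LeakyReLU(),
    nn.Linear(300, action_size), nn.Tanh(),   # what final_activation_function=nn.Tanh appends
)

with torch.no_grad():
    raw = actor(torch.randn(1, state_size))

noisy = raw.numpy() + np.random.normal(scale=0.3, size=raw.shape)  # exploration perturbation
action = np.clip(noisy, -1.0, 1.0)                                 # keep it inside the Box bounds
print(action)
```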

fast_rl/agents/DDPG.py

Lines changed: 46 additions & 25 deletions
@@ -1,7 +1,10 @@
+from copy import deepcopy
+
 import torch
 from fastai.basic_train import LearnerCallback, Any, OptimWrapper, ifnone, F
 import numpy as np
 from fastai.metrics import RMSE
+from torch import nn
 from torch.nn import MSELoss
 from torch.optim import Adam
 
@@ -30,7 +33,7 @@ def on_loss_begin(self, **kwargs: Any):
         """Performs memory updates, exploration updates, and model optimization."""
         if self.learn.model.training:
             self.learn.model.memory.update(item=self.learn.data.x.items[-1])
-        self.learn.model.exploration_strategy.update(self.episode, self.max_episodes,
+        self.learn.model.exploration_strategy.update(episode=self.episode, max_episodes=self.max_episodes,
                                                      do_exploration=self.learn.model.training)
         post_optimize = self.learn.model.optimize()
         if self.learn.model.training:
@@ -44,10 +47,31 @@ def on_loss_begin(self, **kwargs: Any):
             # self.learn.model.target_copy_over()
 
 
+class Critic(nn.Module):
+    def __init__(self, layer_list: list, action_size, state_size, use_bn=False, use_embed=True,
+                 activation_function=None):
+        super().__init__()
+        self.action_size = action_size[0]
+        self.state_size = state_size[0]
+
+        self.fc1 = nn.Linear(self.state_size, layer_list[0])
+        self.fc2 = nn.Linear(layer_list[0] + self.action_size, layer_list[1])
+        self.fc3 = nn.Linear(layer_list[1], 1)
+
+    def forward(self, x):
+        action, x = x[:, self.state_size:], x[:, :self.state_size]
+
+        x = nn.LeakyReLU()(self.fc1(x))
+        x = nn.LeakyReLU()(self.fc2(torch.cat((x, action), 1)))
+        x = nn.LeakyReLU()(self.fc3(x))
+
+        return x
+
+
 class DDPG(BaseAgent):
 
-    def __init__(self, data: MDPDataBunch, memory=None, tau=0.001, batch=64, discount=0.99,
-                 lr=0.005, exploration_strategy=None, env_was_discrete=False):
+    def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount=0.99,
+                 lr=1e-3, actor_lr=1e-4, exploration_strategy=None, env_was_discrete=False):
         """
         Implementation of a continuous control algorithm using an actor/critic architecture.
 
@@ -74,42 +98,45 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=0.001, batch=64, discoun
         self.lr = lr
         self.discount = discount
         self.batch = batch
-        self.tao = tau
+        self.tau = 1
         self.memory = ifnone(memory, ExperienceReplay(10000))
 
-        self.action_model = self.initialize_action_model([30, 30], data)
-        self.critic_model = self.initialize_critic_model([30, 30], data)
+        self.action_model = self.initialize_action_model([400, 300], data)
+        self.critic_model = self.initialize_critic_model([400, 300], data)
 
-        self.opt = OptimWrapper.create(Adam, lr=lr, layer_groups=[self.action_model])
+        self.opt = OptimWrapper.create(Adam, lr=actor_lr, layer_groups=[self.action_model])
         self.critic_optimizer = OptimWrapper.create(Adam, lr=lr, layer_groups=[self.critic_model])
 
-        self.t_action_model = self.initialize_action_model([30, 30], data)
-        self.t_critic_model = self.initialize_critic_model([30, 30], data)
+        self.t_action_model = deepcopy(self.action_model)
+        self.t_critic_model = deepcopy(self.critic_model)
 
         self.target_copy_over()
+        self.tau = tau
 
         self.learner_callbacks = [BaseDDPGCallback]
 
-        self.loss_func = F.smooth_l1_loss# MSELoss()
-        # TODO Move to Ornstein-Uhlenbeck process
+        self.loss_func = MSELoss()
+
         self.exploration_strategy = ifnone(exploration_strategy, GreedyEpsilon(epsilon_start=1, epsilon_end=0.1,
                                                                                decay=0.001,
                                                                                do_exploration=self.training))
 
     def initialize_action_model(self, layers, data):
-        return create_nn_model(layers, *data.get_action_state_size(), True, use_embed=data.train_ds.embeddable)
+        return create_nn_model(layers, *data.get_action_state_size(), False, use_embed=data.train_ds.embeddable,
+                               final_activation_function=nn.Tanh)
 
     def initialize_critic_model(self, layers, data):
         """ Instead of state -> action, we are going state + action -> single expected reward. """
-        return create_nn_model(layers, (1, 0), (sum([_[0] for _ in data.get_action_state_size()]), 0), True,
-                               use_embed=data.train_ds.embeddable)
+        return Critic(layers, *data.get_action_state_size())
 
     def pick_action(self, x):
         if self.training: self.action_model.eval()
         with torch.no_grad():
-            action = super(DDPG, self).pick_action(x)
+            action, x = super(DDPG, self).pick_action(x)
         if self.training: self.action_model.train()
-        return action
+
+        if not self.env_was_discrete: action = np.clip(action, -1, 1)
+        return action, np.clip(x, -1, 1)
 
     def optimize(self):
         """
@@ -140,16 +167,12 @@ def optimize(self):
 
         y_hat = self.critic_model(torch.cat((s, a), 1))
 
-        critic_loss = self.loss_func(y, y_hat)
-
-        print(f'{y[0][:15]}, {y_hat[0][:15]}')
+        critic_loss = self.loss_func(y_hat, y)
 
         if self.training:
             # Optimize critic network
             self.critic_optimizer.zero_grad()
             critic_loss.backward()
-            for param in self.critic_model.parameters():
-                param.grad.data.clamp_(-1, 1)
             self.critic_optimizer.step()
 
         actor_loss = -self.critic_model(torch.cat((s, self.action_model(s)), 1)).mean()
@@ -160,8 +183,6 @@
             # Optimize actor network
             self.opt.zero_grad()
             actor_loss.backward()
-            for param in self.action_model.parameters():
-                param.grad.data.clamp_(-1, 1)
             self.opt.step()
 
         with torch.no_grad():
@@ -174,8 +195,8 @@ def forward(self, x):
 
     def target_copy_over(self):
         """ Soft target updates the actor and critic models.."""
-        self.soft_target_copy_over(self.t_action_model, self.action_model, self.tao)
-        self.soft_target_copy_over(self.t_critic_model, self.critic_model, self.tao)
+        self.soft_target_copy_over(self.t_action_model, self.action_model, self.tau)
+        self.soft_target_copy_over(self.t_critic_model, self.critic_model, self.tau)
 
     def soft_target_copy_over(self, t_m, f_m, tau):
         for target_param, local_param in zip(t_m.parameters(), f_m.parameters()):
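For readers skimming the diff, here is the whole update this file now performs, written as a standalone sketch. Only the loss forms (`MSELoss()(y_hat, y)` for the critic, `-critic(cat(s, actor(s))).mean()` for the actor), the Tanh-bounded actor, and the tau-blended target copy come from the hunks above; the batch shapes, the Bellman-target line, and the body of `soft_target_copy_over` (cut off by the last hunk) are standard-DDPG assumptions per Lillicrap et al., 2016. Note also that the repo's `Critic` injects the action at its second layer, while this sketch concatenates it at the input for brevity.

```python
# Standalone sketch of one DDPG optimization step (assumptions noted above).
from copy import deepcopy

import torch
from torch import nn
from torch.nn import MSELoss
from torch.optim import Adam

state_size, action_size, batch, discount, tau = 3, 1, 64, 0.99, 1e-3  # Pendulum-like sizes (assumed)

actor = nn.Sequential(nn.Linear(state_size, 400), nn.LeakyReLU(),
                      nn.Linear(400, 300), nn.LeakyReLU(),
                      nn.Linear(300, action_size), nn.Tanh())
critic = nn.Sequential(nn.Linear(state_size + action_size, 400), nn.LeakyReLU(),
                       nn.Linear(400, 300), nn.LeakyReLU(),
                       nn.Linear(300, 1))
t_actor, t_critic = deepcopy(actor), deepcopy(critic)     # hard copy once, like target_copy_over() with tau=1
actor_opt = Adam(actor.parameters(), lr=1e-4)
critic_opt = Adam(critic.parameters(), lr=1e-3)
loss_func = MSELoss()

# A fake sampled transition batch (s, a, r, s').
s, a = torch.randn(batch, state_size), torch.rand(batch, action_size) * 2 - 1
r, s_prime = torch.randn(batch, 1), torch.randn(batch, state_size)

with torch.no_grad():  # Bellman target from the target networks (assumed form)
    y = r + discount * t_critic(torch.cat((s_prime, t_actor(s_prime)), 1))

y_hat = critic(torch.cat((s, a), 1))
critic_loss = loss_func(y_hat, y)                         # argument order as fixed in this commit
critic_opt.zero_grad(); critic_loss.backward(); critic_opt.step()   # gradient clamping removed in this commit

actor_loss = -critic(torch.cat((s, actor(s)), 1)).mean()
actor_opt.zero_grad(); actor_loss.backward(); actor_opt.step()

# Soft (Polyak) target update; this per-parameter blend is the usual body for soft_target_copy_over.
for t_m, f_m in ((t_actor, actor), (t_critic, critic)):
    for target_param, local_param in zip(t_m.parameters(), f_m.parameters()):
        target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
```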

fast_rl/core/MarkovDecisionProcess.py

Lines changed: 7 additions & 5 deletions
@@ -195,7 +195,7 @@ def __init__(self, env: gym.Env, feed_type=FEED_TYPE_STATE, render='rgb_array',
         self.env = env
         # MDP specific values
         self.actions = self.get_random_action(env.action_space)
-        self.raw_action = np.random.randn((env.action_space.n))
+        self.raw_action = np.random.randn((env.action_space.shape[0])) if isinstance(env.action_space, Box) else np.random.randn((env.action_space.n))
 
         self.is_done = True
         self.current_state = None
@@ -503,10 +503,12 @@ def __init__(self, state, state_prime, alt_state, action, reward, done, episode,
                     'alt_state': self.alternate_state, 'action': action, 'reward': reward, 'done': done,
                     'episode': episode, 'feed_type': feed_type, 'raw_action': raw_action}
 
-    def clean(self):
-        self.current_state = None
-        self.result_state = None
-        self.alternate_state = None
+    def clean(self, only_alt=False):
+        if not only_alt:
+            self.current_state, self.result_state = None, None
+            self.obj['state'], self.obj['state_prime'] = None, None
+
+        self.alternate_state, self.obj['alt_state'] = None, None
 
     def __str__(self):
         formatted = (
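The new `only_alt` flag is what the memory classes in fast_rl/core/agent_core.py below use (via `reduce_ram`) to realize the "Memory RAM size reduction" from the commit message: the rendered alternate state, typically the largest array in a transition, is dropped before the slice is deep-copied into the replay buffer. A simplified sketch of the effect; `TransitionSlice` here is a stand-in for `MarkovDecisionProcessSlice`, not the repo's exact class:

```python
# Sketch: why clean(only_alt=True) before the deepcopy saves memory. The rendered RGB
# frame (alt_state) is cleared, while the compact state vectors needed for training stay.
import copy
from collections import deque

import numpy as np

class TransitionSlice:                      # stand-in for MarkovDecisionProcessSlice (assumed shape)
    def __init__(self, state, state_prime, alt_state):
        self.current_state, self.result_state, self.alternate_state = state, state_prime, alt_state
        self.obj = {'state': state, 'state_prime': state_prime, 'alt_state': alt_state}

    def clean(self, only_alt=False):
        if not only_alt:
            self.current_state, self.result_state = None, None
            self.obj['state'], self.obj['state_prime'] = None, None
        self.alternate_state, self.obj['alt_state'] = None, None

memory, reduce_ram = deque(maxlen=100000), True
item = TransitionSlice(np.zeros(3), np.zeros(3),
                       np.zeros((500, 500, 3), dtype=np.uint8))   # ~750 KB rendered frame
if reduce_ram: item.clean(True)             # mirrors ExperienceReplay.update below
memory.append(copy.deepcopy(item))          # the deepcopy now skips the heavy frame
```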

fast_rl/core/agent_core.py

Lines changed: 10 additions & 7 deletions
@@ -42,7 +42,7 @@ def perturb(self, action, raw_action, action_space):
         _ = raw_action
         return action, raw_action
 
-    def update(self, episode, max_episodes, do_exploration, **kwargs):
+    def update(self, max_episodes, do_exploration, **kwargs):
         self.do_exploration = do_exploration
 
 
@@ -73,7 +73,7 @@ def perturb(self, action, raw_action, action_space: gym.Space):
         else:
             return action, raw_action, False
 
-    def update(self, current_episode, end_episode=0, **kwargs):
+    def update(self, episode, end_episode=0, **kwargs):
         super(GreedyEpsilon, self).update(**kwargs)
         if self.do_exploration:
             self.end_episode = end_episode
@@ -82,7 +82,7 @@ def update(self, current_episode, end_episode=0, **kwargs):
             self.steps_done += 1
 
 
-class OrnsteinUhlenbeck(ExplorationStrategy):
+class OrnsteinUhlenbeck(GreedyEpsilon):
     def __init__(self, size, mu=0., theta=0.15, sigma=0.2, **kwargs):
         """
 
@@ -108,11 +108,12 @@ def perturb(self, action, raw_action, action_space):
         else: dx = np.zeros(self.x.shape)
 
         self.x += dx
-        return action, torch.from_numpy(self.x).float() + raw_action, False
+        return action, self.epsilon * torch.from_numpy(self.x).float() + raw_action, False
 
 
 class Experience:
-    def __init__(self, memory_size):
+    def __init__(self, memory_size, reduce_ram=False):
+        self.reduce_ram = reduce_ram
         self.max_size = memory_size
         self.callbacks = []
 
@@ -127,7 +128,7 @@ def refresh(self, **kwargs):
 
 
 class ExperienceReplay(Experience):
-    def __init__(self, memory_size):
+    def __init__(self, memory_size, **kwargs):
         """
         Basic store-er of state space transitions for training agents.
 
@@ -138,7 +139,7 @@ def __init__(self, memory_size):
         Args:
             memory_size (int): Max N samples to store
         """
-        super().__init__(memory_size)
+        super().__init__(memory_size, **kwargs)
         self.max_size = memory_size
         self.memory = deque(maxlen=memory_size) # type: List[MarkovDecisionProcessSlice]
 
@@ -150,6 +151,7 @@ def sample(self, batch, **kwargs):
         return random.sample(self.memory, batch)
 
     def update(self, item, **kwargs):
+        if self.reduce_ram: item.clean(True)
        self.memory.append(copy.deepcopy(item))
 
 
@@ -218,6 +220,7 @@ def update(self, item, **kwargs):
 
         """
         maximal_priority = self.alpha
+        if self.reduce_ram: item.clean(True)
         self.memory.add(np.abs(maximal_priority) + self.epsilon, item)
 
 
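Two things changed for exploration here: `OrnsteinUhlenbeck` now inherits from `GreedyEpsilon`, and the correlated noise it returns is scaled by the decaying `epsilon`, so the perturbation anneals over training. A self-contained sketch of that combination; the exponential decay schedule below is an assumption standing in for whatever `GreedyEpsilon.update` actually computes:

```python
# Sketch: epsilon-annealed Ornstein-Uhlenbeck exploration noise. Parameter names follow
# the diff (mu, theta, sigma, epsilon_start, epsilon_end, decay); the decay formula is assumed.
import numpy as np

class AnnealedOUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2,
                 epsilon_start=1.0, epsilon_end=0.1, decay=1e-4):
        self.mu, self.theta, self.sigma = mu * np.ones(size), theta, sigma
        self.epsilon_start, self.epsilon_end, self.decay = epsilon_start, epsilon_end, decay
        self.x, self.steps_done = np.copy(self.mu), 0

    @property
    def epsilon(self):
        # Exponential anneal from epsilon_start toward epsilon_end (assumed schedule).
        return self.epsilon_end + (self.epsilon_start - self.epsilon_end) * np.exp(-self.decay * self.steps_done)

    def perturb(self, raw_action):
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(*self.x.shape)
        self.x += dx
        self.steps_done += 1
        return raw_action + self.epsilon * self.x   # epsilon-scaled noise, as in the new perturb()

noise = AnnealedOUNoise(size=1)
actions = [noise.perturb(np.zeros(1)) for _ in range(5)]   # noise magnitude shrinks as epsilon decays
print(actions)
```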

fast_rl/util/random_thingy.py

Lines changed: 10 additions & 8 deletions
@@ -11,18 +11,20 @@
 from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
 
 # data = MDPDataBunch.from_env('Pendulum-v0', render='human')
-from fast_rl.core.agent_core import GreedyEpsilon, OrnsteinUhlenbeck
+from fast_rl.core.agent_core import GreedyEpsilon, OrnsteinUhlenbeck, ExperienceReplay
+from fast_rl.core.metrics import EpsilonMetric
 
-data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=1000, add_valid=False)
-# data = MDPDataBunch.from_env('Pendulum-v0', render='human', add_valid=False)
+# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=1000, add_valid=False)
+data = MDPDataBunch.from_env('Pendulum-v0', render='human', add_valid=False)
 # data = MDPDataBunch.from_env('MountainCarContinuous-v0', render='human', add_valid=False)
-model = DDPG(data, batch=128, lr=0.01, env_was_discrete=True,
-             exploration_strategy=OrnsteinUhlenbeck(4, do_exploration=True))
-learn = AgentLearner(data, model)
-learn.fit(40)
+model = DDPG(data, batch=128, memory=ExperienceReplay(100000, reduce_ram=True),
+             exploration_strategy=OrnsteinUhlenbeck(epsilon_start=1, epsilon_end=0.1, decay=0.0001, size=1,
+                                                    do_exploration=True, end_episode=450))
+learn = AgentLearner(data, model, metrics=[EpsilonMetric])
+learn.fit(4500)
 
 
 from fast_rl.core.Interpreter import AgentInterpretationAlpha
 
 interp = AgentInterpretationAlpha(learn, DatasetType.Train)
-interp.plot_heatmapped_episode(-1)
+interp.plot_q_density(-1)

res/ddpg_balancing.gif

295 KB
