
Commit ed2d54f

Author: Josiah Laivins (committed)
Added:
* Memory RAM size reduction via cleaning on item input.

Fixed:
* DDPG is stable now. Works on Pendulum as expected/desired.

Notes:
* Now that DDPG works as expected, we will move to preparing the repo for version 1.0. This will involve testing/CI and passing expected benchmarks.
1 parent fea9a2e commit ed2d54f

File tree: 7 files changed (+89 / -51 lines)


README.md

Lines changed: 9 additions & 4 deletions
@@ -173,6 +173,11 @@ could give discrete agents the ability to operate in a continuous domain via bin
 - [X] 0.5.0 DDPG added. let us move
 - [X] 0.5.0 The DDPG paper contains a visualization for Q learning might prove useful. Add to interpreter.
 
+| ![](res/ddpg_balancing.gif) |
+|:----:|
+| *Fig 7: DDPG trains stably now..* |
+
+
 Added q value interpretation per explanation by Lillicrap et al., 2016. Currently both models (DQN and DDPG) have
 unstable q value approximations. Below is an example from DQN.
 ```python
@@ -184,22 +189,22 @@ a failing one will look globular or horizontal.
 
 | ![](res/dqn_q_estimate_1.jpg) |
 |:----:|
-| *Fig 7: Initial Q Value Estimate. Seems globular which is expected for an initial model.* |
+| *Fig 8: Initial Q Value Estimate. Seems globular which is expected for an initial model.* |
 
 | ![](res/dqn_q_estimate_2.jpg) |
 |:----:|
-| *Fig 8: Seems like the DQN is not learning...* |
+| *Fig 9: Seems like the DQN is not learning...* |
 
 | ![](res/dqn_q_estimate_3.jpg) |
 |:----:|
-| *Fig 9: Alarming later epoch results. It seems that the DQN converges to predicting a single Q value.* |
+| *Fig 10: Alarming later epoch results. It seems that the DQN converges to predicting a single Q value.* |
 
 - [X] 0.6.0 Single Global fit function like Fastai's. Think about the missing batch step. Noted some of the changes to
 the existing the Fastai
 
 | ![](res/fit_func_out.jpg) |
 |:----:|
-| *Fig 10: Resulting output of a typical fit function using ref code below.* |
+| *Fig 11: Resulting output of a typical fit function using ref code below.* |
 
 ```python
 from fast_rl.agents.DQN import DuelingDQN

fast_rl/agents/BaseAgent.py

Lines changed: 7 additions & 2 deletions
@@ -46,6 +46,7 @@ def pick_action(self, x):
         if len(x.shape) > 2: raise ValueError('The agent is outputting actions with more than 1 dimension...')
 
         action, x, perturbed = self.exploration_strategy.perturb(x, x, self.data.train_ds.env.action_space)
+        x = np.clip(x, -1.0, 1.0)
 
         if isinstance(self.data.train_ds.env.action_space, Discrete) and not perturbed: action = x.argmax().numpy().item()
         elif isinstance(self.data.train_ds.env.action_space, Box): action = x.squeeze(0).numpy()
@@ -72,7 +73,8 @@ def forward(self, x):
         return x.view(x.size(0), -1)
 
 
-def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use_embed=True, activation_fuction=None):
+def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use_embed=True,
+                    activation_function=None, final_activation_function=None):
     """Generates an nn module.
 
     Notes:
@@ -81,7 +83,7 @@ def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use
     Returns:
 
     """
-    act = nn.LeakyReLU if activation_fuction is None else activation_fuction
+    act = nn.LeakyReLU if activation_function is None else activation_function
    action_size = action_size[0] # For now the dimension of the action does not make a difference.
     # For now keep drop out as 0, test including dropout later
     ps = [0] * len(layer_list)
@@ -93,8 +95,11 @@ def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use
             embedded, n_in = get_embedded(n_in[0], n_out, n_in[1], 5)
             layers += [ToLong(), embedded, Flatten()]
         elif i == 0: n_in = n_in[0]
+        if i == 0 and use_bn: layers += [nn.BatchNorm1d(n_in)]
 
         layers += bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act)
+
+    if final_activation_function is not None: layers += [final_activation_function()]
     return nn.Sequential(*layers)
 
 
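Taken together, the two changes above let a policy network end in a bounding non-linearity (DDPG.py below passes `final_activation_function=nn.Tanh`) while `pick_action` clips whatever the exploration strategy returns. A minimal sketch of that pattern; the layer sizes and noise scale here are illustrative assumptions, not values from the repo:

```python
# Sketch: a bounded continuous-action head. Tanh keeps the raw policy output in (-1, 1);
# exploration noise can still push it outside, hence the clip, mirroring the
# `np.clip(x, -1.0, 1.0)` added to pick_action. Sizes and noise scale are assumptions.
import numpy as np
import torch
from torch import nn

state_size, action_size = 3, 1   # Pendulum-v0-like dimensions (assumed)
actor = nn.Sequential(
    nn.Linear(state_size, 400), nn.LeakyReLU(),
    nn.Linear(400, 300), nn.LeakyReLU(),
    nn.Linear(300, action_size), nn.Tanh(),   # what final_activation_function=nn.Tanh appends
)

with torch.no_grad():
    raw = actor(torch.randn(1, state_size))

noisy = raw.numpy() + np.random.normal(scale=0.3, size=raw.shape)  # exploration perturbation
action = np.clip(noisy, -1.0, 1.0)                                 # keep it inside the Box bounds
print(action)
```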

fast_rl/agents/DDPG.py

Lines changed: 46 additions & 25 deletions
@@ -1,7 +1,10 @@
+from copy import deepcopy
+
 import torch
 from fastai.basic_train import LearnerCallback, Any, OptimWrapper, ifnone, F
 import numpy as np
 from fastai.metrics import RMSE
+from torch import nn
 from torch.nn import MSELoss
 from torch.optim import Adam
 
@@ -30,7 +33,7 @@ def on_loss_begin(self, **kwargs: Any):
         """Performs memory updates, exploration updates, and model optimization."""
         if self.learn.model.training:
             self.learn.model.memory.update(item=self.learn.data.x.items[-1])
-        self.learn.model.exploration_strategy.update(self.episode, self.max_episodes,
+        self.learn.model.exploration_strategy.update(episode=self.episode, max_episodes=self.max_episodes,
                                                      do_exploration=self.learn.model.training)
         post_optimize = self.learn.model.optimize()
         if self.learn.model.training:
@@ -44,10 +47,31 @@ def on_loss_begin(self, **kwargs: Any):
             # self.learn.model.target_copy_over()
 
 
+class Critic(nn.Module):
+    def __init__(self, layer_list: list, action_size, state_size, use_bn=False, use_embed=True,
+                 activation_function=None):
+        super().__init__()
+        self.action_size = action_size[0]
+        self.state_size = state_size[0]
+
+        self.fc1 = nn.Linear(self.state_size, layer_list[0])
+        self.fc2 = nn.Linear(layer_list[0] + self.action_size, layer_list[1])
+        self.fc3 = nn.Linear(layer_list[1], 1)
+
+    def forward(self, x):
+        action, x = x[:, self.state_size:], x[:, :self.state_size]
+
+        x = nn.LeakyReLU()(self.fc1(x))
+        x = nn.LeakyReLU()(self.fc2(torch.cat((x, action), 1)))
+        x = nn.LeakyReLU()(self.fc3(x))
+
+        return x
+
+
 class DDPG(BaseAgent):
 
-    def __init__(self, data: MDPDataBunch, memory=None, tau=0.001, batch=64, discount=0.99,
-                 lr=0.005, exploration_strategy=None, env_was_discrete=False):
+    def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount=0.99,
+                 lr=1e-3, actor_lr=1e-4, exploration_strategy=None, env_was_discrete=False):
         """
         Implementation of a continuous control algorithm using an actor/critic architecture.
 
@@ -74,42 +98,45 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=0.001, batch=64, discoun
         self.lr = lr
         self.discount = discount
         self.batch = batch
-        self.tao = tau
+        self.tau = 1
         self.memory = ifnone(memory, ExperienceReplay(10000))
 
-        self.action_model = self.initialize_action_model([30, 30], data)
-        self.critic_model = self.initialize_critic_model([30, 30], data)
+        self.action_model = self.initialize_action_model([400, 300], data)
+        self.critic_model = self.initialize_critic_model([400, 300], data)
 
-        self.opt = OptimWrapper.create(Adam, lr=lr, layer_groups=[self.action_model])
+        self.opt = OptimWrapper.create(Adam, lr=actor_lr, layer_groups=[self.action_model])
         self.critic_optimizer = OptimWrapper.create(Adam, lr=lr, layer_groups=[self.critic_model])
 
-        self.t_action_model = self.initialize_action_model([30, 30], data)
-        self.t_critic_model = self.initialize_critic_model([30, 30], data)
+        self.t_action_model = deepcopy(self.action_model)
+        self.t_critic_model = deepcopy(self.critic_model)
 
         self.target_copy_over()
+        self.tau = tau
 
         self.learner_callbacks = [BaseDDPGCallback]
 
-        self.loss_func = F.smooth_l1_loss# MSELoss()
-        # TODO Move to Ornstein-Uhlenbeck process
+        self.loss_func = MSELoss()
+
         self.exploration_strategy = ifnone(exploration_strategy, GreedyEpsilon(epsilon_start=1, epsilon_end=0.1,
                                                                                decay=0.001,
                                                                                do_exploration=self.training))
 
     def initialize_action_model(self, layers, data):
-        return create_nn_model(layers, *data.get_action_state_size(), True, use_embed=data.train_ds.embeddable)
+        return create_nn_model(layers, *data.get_action_state_size(), False, use_embed=data.train_ds.embeddable,
+                               final_activation_function=nn.Tanh)
 
     def initialize_critic_model(self, layers, data):
         """ Instead of state -> action, we are going state + action -> single expected reward. """
-        return create_nn_model(layers, (1, 0), (sum([_[0] for _ in data.get_action_state_size()]), 0), True,
-                               use_embed=data.train_ds.embeddable)
+        return Critic(layers, *data.get_action_state_size())
 
     def pick_action(self, x):
         if self.training: self.action_model.eval()
         with torch.no_grad():
-            action = super(DDPG, self).pick_action(x)
+            action, x = super(DDPG, self).pick_action(x)
         if self.training: self.action_model.train()
-        return action
+
+        if not self.env_was_discrete: action = np.clip(action, -1, 1)
+        return action, np.clip(x, -1, 1)
 
     def optimize(self):
         """
@@ -140,16 +167,12 @@ def optimize(self):
 
         y_hat = self.critic_model(torch.cat((s, a), 1))
 
-        critic_loss = self.loss_func(y, y_hat)
-
-        print(f'{y[0][:15]}, {y_hat[0][:15]}')
+        critic_loss = self.loss_func(y_hat, y)
 
         if self.training:
             # Optimize critic network
             self.critic_optimizer.zero_grad()
             critic_loss.backward()
-            for param in self.critic_model.parameters():
-                param.grad.data.clamp_(-1, 1)
             self.critic_optimizer.step()
 
         actor_loss = -self.critic_model(torch.cat((s, self.action_model(s)), 1)).mean()
@@ -160,8 +183,6 @@
             # Optimize actor network
             self.opt.zero_grad()
             actor_loss.backward()
-            for param in self.action_model.parameters():
-                param.grad.data.clamp_(-1, 1)
             self.opt.step()
 
         with torch.no_grad():
@@ -174,8 +195,8 @@ def forward(self, x):
 
     def target_copy_over(self):
         """ Soft target updates the actor and critic models.."""
-        self.soft_target_copy_over(self.t_action_model, self.action_model, self.tao)
-        self.soft_target_copy_over(self.t_critic_model, self.critic_model, self.tao)
+        self.soft_target_copy_over(self.t_action_model, self.action_model, self.tau)
+        self.soft_target_copy_over(self.t_critic_model, self.critic_model, self.tau)
 
     def soft_target_copy_over(self, t_m, f_m, tau):
         for target_param, local_param in zip(t_m.parameters(), f_m.parameters()):
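For readers skimming the diff, here is the whole update this file now performs, written as a standalone sketch. Only the loss forms (`MSELoss()(y_hat, y)` for the critic, `-critic(cat(s, actor(s))).mean()` for the actor), the Tanh-bounded actor, and the tau-blended target copy come from the hunks above; the batch shapes, the Bellman-target line, and the body of `soft_target_copy_over` (cut off by the last hunk) are standard-DDPG assumptions per Lillicrap et al., 2016. Note also that the repo's `Critic` injects the action at its second layer, while this sketch concatenates it at the input for brevity.

```python
# Standalone sketch of one DDPG optimization step (assumptions noted above).
from copy import deepcopy

import torch
from torch import nn
from torch.nn import MSELoss
from torch.optim import Adam

state_size, action_size, batch, discount, tau = 3, 1, 64, 0.99, 1e-3  # Pendulum-like sizes (assumed)

actor = nn.Sequential(nn.Linear(state_size, 400), nn.LeakyReLU(),
                      nn.Linear(400, 300), nn.LeakyReLU(),
                      nn.Linear(300, action_size), nn.Tanh())
critic = nn.Sequential(nn.Linear(state_size + action_size, 400), nn.LeakyReLU(),
                       nn.Linear(400, 300), nn.LeakyReLU(),
                       nn.Linear(300, 1))
t_actor, t_critic = deepcopy(actor), deepcopy(critic)     # hard copy once, like target_copy_over() with tau=1
actor_opt = Adam(actor.parameters(), lr=1e-4)
critic_opt = Adam(critic.parameters(), lr=1e-3)
loss_func = MSELoss()

# A fake sampled transition batch (s, a, r, s').
s, a = torch.randn(batch, state_size), torch.rand(batch, action_size) * 2 - 1
r, s_prime = torch.randn(batch, 1), torch.randn(batch, state_size)

with torch.no_grad():  # Bellman target from the target networks (assumed form)
    y = r + discount * t_critic(torch.cat((s_prime, t_actor(s_prime)), 1))

y_hat = critic(torch.cat((s, a), 1))
critic_loss = loss_func(y_hat, y)                         # argument order as fixed in this commit
critic_opt.zero_grad(); critic_loss.backward(); critic_opt.step()   # gradient clamping removed in this commit

actor_loss = -critic(torch.cat((s, actor(s)), 1)).mean()
actor_opt.zero_grad(); actor_loss.backward(); actor_opt.step()

# Soft (Polyak) target update; this per-parameter blend is the usual body for soft_target_copy_over.
for t_m, f_m in ((t_actor, actor), (t_critic, critic)):
    for target_param, local_param in zip(t_m.parameters(), f_m.parameters()):
        target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
```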

fast_rl/core/MarkovDecisionProcess.py

Lines changed: 7 additions & 5 deletions
@@ -195,7 +195,7 @@ def __init__(self, env: gym.Env, feed_type=FEED_TYPE_STATE, render='rgb_array',
         self.env = env
         # MDP specific values
         self.actions = self.get_random_action(env.action_space)
-        self.raw_action = np.random.randn((env.action_space.n))
+        self.raw_action = np.random.randn((env.action_space.shape[0])) if isinstance(env.action_space, Box) else np.random.randn((env.action_space.n))
 
         self.is_done = True
         self.current_state = None
@@ -503,10 +503,12 @@ def __init__(self, state, state_prime, alt_state, action, reward, done, episode,
                     'alt_state': self.alternate_state, 'action': action, 'reward': reward, 'done': done,
                     'episode': episode, 'feed_type': feed_type, 'raw_action': raw_action}
 
-    def clean(self):
-        self.current_state = None
-        self.result_state = None
-        self.alternate_state = None
+    def clean(self, only_alt=False):
+        if not only_alt:
+            self.current_state, self.result_state = None, None
+            self.obj['state'], self.obj['state_prime'] = None, None
+
+        self.alternate_state, self.obj['alt_state'] = None, None
 
     def __str__(self):
         formatted = (
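The new `only_alt` flag is what the memory classes in fast_rl/core/agent_core.py below use (via `reduce_ram`) to realize the "Memory RAM size reduction" from the commit message: the rendered alternate state, typically the largest array in a transition, is dropped before the slice is deep-copied into the replay buffer. A simplified sketch of the effect; `TransitionSlice` here is a stand-in for `MarkovDecisionProcessSlice`, not the repo's exact class:

```python
# Sketch: why clean(only_alt=True) before the deepcopy saves memory. The rendered RGB
# frame (alt_state) is cleared, while the compact state vectors needed for training stay.
import copy
from collections import deque

import numpy as np

class TransitionSlice:                      # stand-in for MarkovDecisionProcessSlice (assumed shape)
    def __init__(self, state, state_prime, alt_state):
        self.current_state, self.result_state, self.alternate_state = state, state_prime, alt_state
        self.obj = {'state': state, 'state_prime': state_prime, 'alt_state': alt_state}

    def clean(self, only_alt=False):
        if not only_alt:
            self.current_state, self.result_state = None, None
            self.obj['state'], self.obj['state_prime'] = None, None
        self.alternate_state, self.obj['alt_state'] = None, None

memory, reduce_ram = deque(maxlen=100000), True
item = TransitionSlice(np.zeros(3), np.zeros(3),
                       np.zeros((500, 500, 3), dtype=np.uint8))   # ~750 KB rendered frame
if reduce_ram: item.clean(True)             # mirrors ExperienceReplay.update below
memory.append(copy.deepcopy(item))          # the deepcopy now skips the heavy frame
```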

fast_rl/core/agent_core.py

Lines changed: 10 additions & 7 deletions
@@ -42,7 +42,7 @@ def perturb(self, action, raw_action, action_space):
         _ = raw_action
         return action, raw_action
 
-    def update(self, episode, max_episodes, do_exploration, **kwargs):
+    def update(self, max_episodes, do_exploration, **kwargs):
         self.do_exploration = do_exploration
 
 
@@ -73,7 +73,7 @@ def perturb(self, action, raw_action, action_space: gym.Space):
         else:
             return action, raw_action, False
 
-    def update(self, current_episode, end_episode=0, **kwargs):
+    def update(self, episode, end_episode=0, **kwargs):
         super(GreedyEpsilon, self).update(**kwargs)
         if self.do_exploration:
             self.end_episode = end_episode
@@ -82,7 +82,7 @@ def update(self, current_episode, end_episode=0, **kwargs):
             self.steps_done += 1
 
 
-class OrnsteinUhlenbeck(ExplorationStrategy):
+class OrnsteinUhlenbeck(GreedyEpsilon):
     def __init__(self, size, mu=0., theta=0.15, sigma=0.2, **kwargs):
         """
 
@@ -108,11 +108,12 @@ def perturb(self, action, raw_action, action_space):
         else: dx = np.zeros(self.x.shape)
 
         self.x += dx
-        return action, torch.from_numpy(self.x).float() + raw_action, False
+        return action, self.epsilon * torch.from_numpy(self.x).float() + raw_action, False
 
 
 class Experience:
-    def __init__(self, memory_size):
+    def __init__(self, memory_size, reduce_ram=False):
+        self.reduce_ram = reduce_ram
         self.max_size = memory_size
         self.callbacks = []
 
@@ -127,7 +128,7 @@ def refresh(self, **kwargs):
 
 
 class ExperienceReplay(Experience):
-    def __init__(self, memory_size):
+    def __init__(self, memory_size, **kwargs):
         """
         Basic store-er of state space transitions for training agents.
 
@@ -138,7 +139,7 @@ def __init__(self, memory_size):
         Args:
             memory_size (int): Max N samples to store
         """
-        super().__init__(memory_size)
+        super().__init__(memory_size, **kwargs)
         self.max_size = memory_size
         self.memory = deque(maxlen=memory_size) # type: List[MarkovDecisionProcessSlice]
 
@@ -150,6 +151,7 @@ def sample(self, batch, **kwargs):
         return random.sample(self.memory, batch)
 
     def update(self, item, **kwargs):
+        if self.reduce_ram: item.clean(True)
        self.memory.append(copy.deepcopy(item))
 
 
@@ -218,6 +220,7 @@ def update(self, item, **kwargs):
 
         """
         maximal_priority = self.alpha
+        if self.reduce_ram: item.clean(True)
         self.memory.add(np.abs(maximal_priority) + self.epsilon, item)
 
 
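Two things changed for exploration here: `OrnsteinUhlenbeck` now inherits from `GreedyEpsilon`, and the correlated noise it returns is scaled by the decaying `epsilon`, so the perturbation anneals over training. A self-contained sketch of that combination; the exponential decay schedule below is an assumption standing in for whatever `GreedyEpsilon.update` actually computes:

```python
# Sketch: epsilon-annealed Ornstein-Uhlenbeck exploration noise. Parameter names follow
# the diff (mu, theta, sigma, epsilon_start, epsilon_end, decay); the decay formula is assumed.
import numpy as np

class AnnealedOUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2,
                 epsilon_start=1.0, epsilon_end=0.1, decay=1e-4):
        self.mu, self.theta, self.sigma = mu * np.ones(size), theta, sigma
        self.epsilon_start, self.epsilon_end, self.decay = epsilon_start, epsilon_end, decay
        self.x, self.steps_done = np.copy(self.mu), 0

    @property
    def epsilon(self):
        # Exponential anneal from epsilon_start toward epsilon_end (assumed schedule).
        return self.epsilon_end + (self.epsilon_start - self.epsilon_end) * np.exp(-self.decay * self.steps_done)

    def perturb(self, raw_action):
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(*self.x.shape)
        self.x += dx
        self.steps_done += 1
        return raw_action + self.epsilon * self.x   # epsilon-scaled noise, as in the new perturb()

noise = AnnealedOUNoise(size=1)
actions = [noise.perturb(np.zeros(1)) for _ in range(5)]   # noise magnitude shrinks as epsilon decays
print(actions)
```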

fast_rl/util/random_thingy.py

Lines changed: 10 additions & 8 deletions
@@ -11,18 +11,20 @@
 from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
 
 # data = MDPDataBunch.from_env('Pendulum-v0', render='human')
-from fast_rl.core.agent_core import GreedyEpsilon, OrnsteinUhlenbeck
+from fast_rl.core.agent_core import GreedyEpsilon, OrnsteinUhlenbeck, ExperienceReplay
+from fast_rl.core.metrics import EpsilonMetric
 
-data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=1000, add_valid=False)
-# data = MDPDataBunch.from_env('Pendulum-v0', render='human', add_valid=False)
+# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=1000, add_valid=False)
+data = MDPDataBunch.from_env('Pendulum-v0', render='human', add_valid=False)
 # data = MDPDataBunch.from_env('MountainCarContinuous-v0', render='human', add_valid=False)
-model = DDPG(data, batch=128, lr=0.01, env_was_discrete=True,
-             exploration_strategy=OrnsteinUhlenbeck(4, do_exploration=True))
-learn = AgentLearner(data, model)
-learn.fit(40)
+model = DDPG(data, batch=128, memory=ExperienceReplay(100000, reduce_ram=True),
+             exploration_strategy=OrnsteinUhlenbeck(epsilon_start=1, epsilon_end=0.1, decay=0.0001, size=1,
+                                                    do_exploration=True, end_episode=450))
+learn = AgentLearner(data, model, metrics=[EpsilonMetric])
+learn.fit(4500)
 
 
 from fast_rl.core.Interpreter import AgentInterpretationAlpha
 
 interp = AgentInterpretationAlpha(learn, DatasetType.Train)
-interp.plot_heatmapped_episode(-1)
+interp.plot_q_density(-1)

res/ddpg_balancing.gif

295 KB
