 from unityagents import UnityEnvironmentException


-def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6):
+def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6, normalize=False, num_layers=2):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
     appropriate PPO agent model for the environment.
@@ -17,12 +17,14 @@ def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6):
     :return: a sub-class of PPOAgent tailored to the environment.
     :param max_step: Total number of training steps.
     """
+    if num_layers < 1: num_layers = 1
+
     brain_name = env.brain_names[0]
     brain = env.brains[brain_name]
     if brain.action_space_type == "continuous":
-        return ContinuousControlModel(lr, brain, h_size, epsilon, max_step)
+        return ContinuousControlModel(lr, brain, h_size, epsilon, max_step, normalize, num_layers)
     if brain.action_space_type == "discrete":
-        return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step)
+        return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers)


 def save_model(sess, saver, model_path="./", steps=0):
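For context, a minimal usage sketch of the extended factory function, assuming a Unity environment has already been built and launched; the environment name and hyper-parameter values below are illustrative, not part of this change:

    from unityagents import UnityEnvironment

    env = UnityEnvironment(file_name="3DBall")   # illustrative environment binary
    # create_agent_model is the function defined in this file
    model = create_agent_model(env,
                               lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6,
                               normalize=True,   # enable running state normalization
                               num_layers=3)     # three dense layers per encoder stream

With the defaults (normalize=False, num_layers=2), existing call sites keep working without modification.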
@@ -57,6 +59,9 @@ def export_graph(model_path, env_name="env", target_nodes="action,value_estimate


 class PPOModel(object):
+    def __init__(self):
+        self.normalize = False
+
     def create_global_steps(self):
         """Creates TF ops to track and increment global training step."""
         self.global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
@@ -68,7 +73,7 @@ def create_reward_encoder(self):
         self.new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
         self.update_reward = tf.assign(self.last_reward, self.new_reward)

-    def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation):
+    def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation, num_layers):
         """
         Builds a set of visual (CNN) encoders.
         :param o_size_h: Height observation size.
@@ -92,11 +97,13 @@ def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation):
                                           use_bias=False, activation=activation)
             self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                                           use_bias=False, activation=activation)
-            hidden = tf.layers.dense(c_layers.flatten(self.conv2), h_size, use_bias=False, activation=activation)
+            hidden = c_layers.flatten(self.conv2)
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
             streams.append(hidden)
         return streams

-    def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation):
+    def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
         """
         Builds a set of hidden state encoders.
         :param s_size: state input size.
@@ -107,27 +114,30 @@ def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation):
         """
         self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32, name='state')

-        self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32,
-                                            initializer=tf.zeros_initializer())
-        self.running_variance = tf.get_variable("running_variance", [s_size], trainable=False, dtype=tf.float32,
-                                                initializer=tf.ones_initializer())
-
-        self.normalized_state = tf.clip_by_value((self.state_in - self.running_mean) / tf.sqrt(
-            self.running_variance / (tf.cast(self.global_step, tf.float32) + 1)), -5, 5, name="normalized_state")
+        if self.normalize:
+            self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32,
+                                                initializer=tf.zeros_initializer())
+            self.running_variance = tf.get_variable("running_variance", [s_size], trainable=False, dtype=tf.float32,
+                                                    initializer=tf.ones_initializer())

-        self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_mean')
-        self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_variance')
-        self.update_mean = tf.assign(self.running_mean, self.new_mean)
-        self.update_variance = tf.assign(self.running_variance, self.new_variance)
+            self.normalized_state = tf.clip_by_value((self.state_in - self.running_mean) / tf.sqrt(
+                self.running_variance / (tf.cast(self.global_step, tf.float32) + 1)), -5, 5, name="normalized_state")

+            self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_mean')
+            self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_variance')
+            self.update_mean = tf.assign(self.running_mean, self.new_mean)
+            self.update_variance = tf.assign(self.running_variance, self.new_variance)
+        else:
+            self.normalized_state = self.state_in
         streams = []
         for i in range(num_streams):
-            hidden_1 = tf.layers.dense(self.normalized_state, h_size, use_bias=False, activation=activation)
-            hidden_2 = tf.layers.dense(hidden_1, h_size, use_bias=False, activation=activation)
-            streams.append(hidden_2)
+            hidden = self.normalized_state
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
+            streams.append(hidden)
         return streams

-    def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation):
+    def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
         """
         Builds a set of hidden state encoders from discrete state input.
         :param s_size: state input size (discrete).
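When normalize is enabled, the graph only defines the normalization variables and the update ops; the statistics themselves have to be computed outside the graph and fed through new_mean / new_variance. Note that running_variance is divided by (global_step + 1) before the square root, so it behaves like an accumulated sum of squared deviations rather than a ready-made variance. The trainer that drives these ops is not part of this diff; a minimal sketch, assuming a running session sess, a model built with normalize=True, and a NumPy batch states of shape [batch, s_size], might look like:

    import numpy as np

    # Illustrative only -- the real trainer logic lives outside this file.
    mean, variance, step = sess.run([model.running_mean, model.running_variance, model.global_step])
    batch_mean = states.mean(axis=0)
    new_mean = mean + (batch_mean - mean) / (step + 1)                 # incremental mean update
    new_variance = variance + (batch_mean - new_mean) * (batch_mean - mean)  # accumulate squared deviations
    sess.run([model.update_mean, model.update_variance],
             feed_dict={model.new_mean: new_mean, model.new_variance: new_variance})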
@@ -140,8 +150,10 @@ def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation):
         state_in = tf.reshape(self.state_in, [-1])
         state_onehot = c_layers.one_hot_encoding(state_in, s_size)
         streams = []
+        hidden = state_onehot
         for i in range(num_streams):
-            hidden = tf.layers.dense(state_onehot, h_size, use_bias=False, activation=activation)
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
             streams.append(hidden)
         return streams

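One subtlety here: hidden is seeded from state_onehot once, before the stream loop, so with num_streams > 1 (as used by ContinuousControlModel below) the second stream stacks its dense layers on top of the first stream's output rather than restarting from the one-hot input, whereas the continuous encoder resets hidden inside its stream loop. A hypothetical variant that gives every stream its own stack, shown only for comparison and not part of this commit, would be:

    streams = []
    for i in range(num_streams):
        hidden = state_onehot  # restart from the one-hot input for each stream
        for j in range(num_layers):
            hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
        streams.append(hidden)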
@@ -186,29 +198,31 @@ def create_ppo_optimizer(self, probs, old_probs, value, entropy, beta, epsilon,


 class ContinuousControlModel(PPOModel):
-    def __init__(self, lr, brain, h_size, epsilon, max_step):
+    def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
         """
         Creates Continuous Control Actor-Critic model.
         :param brain: State-space size
         :param h_size: Hidden layer size
         """
+        super().__init__()
         s_size = brain.state_space_size
         a_size = brain.action_space_size

+        self.normalize = normalize
         self.create_global_steps()
         self.create_reward_encoder()

         hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
         if brain.number_observations > 0:
             height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
             bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh)
+            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
-                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh)
+                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)
             else:
-                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh)
+                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)

         if hidden_visual is None and hidden_state is None:
             raise Exception("No valid network configuration possible. "
@@ -249,26 +263,28 @@ def __init__(self, lr, brain, h_size, epsilon, max_step):


 class DiscreteControlModel(PPOModel):
-    def __init__(self, lr, brain, h_size, epsilon, beta, max_step):
+    def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers):
         """
         Creates Discrete Control Actor-Critic model.
         :param brain: State-space size
         :param h_size: Hidden layer size
         """
+        super().__init__()
         self.create_global_steps()
         self.create_reward_encoder()
+        self.normalize = normalize

         hidden_state, hidden_visual, hidden = None, None, None
         if brain.number_observations > 0:
             height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
             bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu)[0]
+            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
-                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu)[0]
+                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]
             else:
-                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu)[0]
+                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]

         if hidden_visual is None and hidden_state is None:
             raise Exception("No valid network configuration possible. "