import tensorflow as tf
import tensorflow.contrib.layers as c_layers
from tensorflow.python.tools import freeze_graph
+from unityagents import UnityEnvironmentException


-def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3):
+def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6):
    """
    Takes a Unity environment and model-specific hyperparameters and returns the
    appropriate PPO agent model for the environment.
@@ -17,16 +18,23 @@ def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3):
    """
    brain_name = env.brain_names[0]
    if env.brains[brain_name].action_space_type == "continuous":
-        return ContinuousControlModel(lr, env.brains[brain_name].state_space_size,
-                                      env.brains[brain_name].action_space_size, h_size, epsilon, beta)
+        if env.brains[brain_name].number_observations == 0:
+            return ContinuousControlModel(lr, env.brains[brain_name].state_space_size,
+                                          env.brains[brain_name].action_space_size, h_size, epsilon, beta, max_step)
+        else:
+            raise UnityEnvironmentException("There is currently no PPO model which supports both a continuous "
+                                            "action space and camera observations.")
    if env.brains[brain_name].action_space_type == "discrete":
        if env.brains[brain_name].number_observations == 0:
            return DiscreteControlModel(lr, env.brains[brain_name].state_space_size,
-                                        env.brains[brain_name].action_space_size, h_size, epsilon, beta)
+                                        env.brains[brain_name].action_space_size, h_size, epsilon, beta, max_step)
        else:
            brain = env.brains[brain_name]
+            if env.brains[brain_name].state_space_size > 0:
+                print("This brain contains agents with both observations and states. There is currently no PPO model "
+                      "which supports this. Defaulting to Vision-based PPO model.")
            h, w = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            return VisualDiscreteControlModel(lr, h, w, env.brains[brain_name].action_space_size, h_size, epsilon, beta)
+            return VisualDiscreteControlModel(lr, h, w, env.brains[brain_name].action_space_size, h_size, epsilon, beta, max_step)
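
# A minimal usage sketch (editor's assumption, not part of this change): the
# environment binary name "3DBall" and the hyperparameter values below are
# illustrative only.
#
#     from unityagents import UnityEnvironment
#     env = UnityEnvironment(file_name="3DBall")
#     model = create_agent_model(env, lr=1e-4, h_size=128,
#                                epsilon=0.2, beta=1e-3, max_step=5e6)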


def save_model(sess, saver, model_path="./", steps=0):
@@ -37,7 +45,7 @@ def save_model(sess, saver, model_path="./", steps=0):
    :param steps: Current number of steps in training process.
    :param saver: TensorFlow saver for session.
    """
-    last_checkpoint = model_path + '/model-' + str(steps)+ '.cptk'
+    last_checkpoint = model_path + '/model-' + str(steps) + '.cptk'
    saver.save(sess, last_checkpoint)
    tf.train.write_graph(sess.graph_def, model_path, 'raw_graph_def.pb', as_text=False)
    print("Saved Model")
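
# A minimal usage sketch (editor's assumption): `sess` and `saver` come from the
# surrounding training loop, and the graph is assumed to already hold variables.
#
#     with tf.Session() as sess:
#         saver = tf.train.Saver()
#         sess.run(tf.global_variables_initializer())
#         save_model(sess, saver, model_path="./models", steps=10000)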
@@ -61,7 +69,7 @@ def export_graph(model_path, env_name="env", target_nodes="action"):


class PPOModel(object):
-    def __init__(self, probs, old_probs, value, entropy, beta, epsilon, lr):
+    def __init__(self, probs, old_probs, value, entropy, beta, epsilon, lr, max_step):
        """
        Creates training-specific TensorFlow ops for PPO models.
        :param probs: Current policy probabilities
@@ -85,15 +93,18 @@ def __init__(self, probs, old_probs, value, entropy, beta, epsilon, lr):

        self.loss = self.policy_loss + self.value_loss - beta * tf.reduce_mean(entropy)
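        # The total objective above sums the policy surrogate loss and the value
        # loss, then subtracts a beta-scaled entropy bonus that discourages
        # premature policy collapse; given the epsilon parameter, policy_loss is
        # presumably the PPO clipped surrogate built earlier in __init__
        # (elided from this hunk).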

-        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
+        self.global_step = tf.Variable(0, trainable=False, name='global_step', dtype=tf.int32)
+        self.learning_rate = tf.train.polynomial_decay(lr, self.global_step,
+                                                       max_step, 1e-10,
+                                                       power=1.0)
+        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.update_batch = optimizer.minimize(self.loss)

-        self.global_step = tf.Variable(0, trainable=False, name='global_step', dtype=tf.int32)
-        self.increment_step = tf.assign(self.global_step, self.global_step + 1)
+        self.increment_step = tf.assign(self.global_step, self.global_step + 1)
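
# A minimal sketch (editor's note, mirroring tf.train.polynomial_decay with
# power=1.0): the schedule above anneals the learning rate linearly from `lr`
# down to the 1e-10 end value over `max_step` steps.
#
#     def decayed_lr(lr, step, max_step, end_lr=1e-10, power=1.0):
#         step = min(step, max_step)
#         return (lr - end_lr) * (1.0 - step / max_step) ** power + end_lr
#
#     decayed_lr(1e-4, 0, 5e6)      # 1e-04 at the start of training
#     decayed_lr(1e-4, 2.5e6, 5e6)  # ~5e-05 halfway to max_step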


class ContinuousControlModel(PPOModel):
-    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta):
+    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta, max_step):
        """
        Creates Continuous Control Actor-Critic model.
        :param s_size: State-space size
@@ -127,11 +138,11 @@ def __init__(self, lr, s_size, a_size, h_size, epsilon, beta):

        self.old_probs = tf.placeholder(shape=[None, a_size], dtype=tf.float32, name='old_probabilities')

-        PPOModel.__init__(self, self.probs, self.old_probs, self.value, self.entropy, 0.0, epsilon, lr)
+        PPOModel.__init__(self, self.probs, self.old_probs, self.value, self.entropy, 0.0, epsilon, lr, max_step)


class DiscreteControlModel(PPOModel):
-    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta):
+    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta, max_step):
        """
        Creates Discrete Control Actor-Critic model.
        :param s_size: State-space size
@@ -158,11 +169,11 @@ def __init__(self, lr, s_size, a_size, h_size, epsilon, beta):
        self.old_responsible_probs = tf.reduce_sum(self.old_probs * self.selected_actions, axis=1)
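        # Worked example (editor's note): selected_actions is a one-hot encoding
        # of the chosen action (built earlier in __init__, elided here), so the
        # reduce_sum above picks out that action's probability per agent, e.g.
        # probs = [0.2, 0.7, 0.1] with action 1 selected gives 0.7.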

        PPOModel.__init__(self, self.responsible_probs, self.old_responsible_probs,
-                          self.value, self.entropy, beta, epsilon, lr)
+                          self.value, self.entropy, beta, epsilon, lr, max_step)


class VisualDiscreteControlModel(PPOModel):
-    def __init__(self, lr, o_size_h, o_size_w, a_size, h_size, epsilon, beta):
+    def __init__(self, lr, o_size_h, o_size_w, a_size, h_size, epsilon, beta, max_step):
        """
        Creates Discrete Control Actor-Critic model for use with visual observations (images).
        :param o_size_h: Observation height.
@@ -194,4 +205,4 @@ def __init__(self, lr, o_size_h, o_size_w, a_size, h_size, epsilon, beta):
        self.old_responsible_probs = tf.reduce_sum(self.old_probs * self.selected_actions, axis=1)

        PPOModel.__init__(self, self.responsible_probs, self.old_responsible_probs,
-                          self.value, self.entropy, beta, epsilon, lr)
+                          self.value, self.entropy, beta, epsilon, lr, max_step)