Commit f55ddc4

Add flags for normalization and variable layers

1 parent 7a54506

File tree: 4 files changed, +69 -42 lines

  python/PPO.ipynb
  python/ppo.py
  python/ppo/models.py
  python/ppo/trainer.py

python/PPO.ipynb

Lines changed: 14 additions & 8 deletions

@@ -58,11 +58,13 @@
     "time_horizon = 2048 # How many steps to collect per agent before adding to buffer.\n",
     "beta = 1e-3 # Strength of entropy regularization\n",
     "num_epoch = 5 # Number of gradient descent steps per batch of experiences.\n",
+    "num_layers = 2 # Number of hidden layers between state/observation encoding and value/policy layers.\n",
     "epsilon = 0.2 # Acceptable threshold around ratio of old and new policy probabilities.\n",
     "buffer_size = 2048 # How large the experience buffer should be before gradient descent.\n",
     "learning_rate = 3e-4 # Model learning rate.\n",
     "hidden_units = 64 # Number of units in hidden layer.\n",
     "batch_size = 64 # How many experiences per gradient descent update step.\n",
+    "normalize = False\n",
     "\n",
     "### Logging dictionary for hyperparameters\n",
     "hyperparameter_dict = {'max_steps':max_steps, 'run_path':run_path, 'env_name':env_name,\n",
@@ -81,7 +83,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "env = UnityEnvironment(file_name=env_name, curriculum=curriculum_file)\n",
@@ -100,6 +104,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "collapsed": true,
     "scrolled": true
    },
    "outputs": [],
@@ -124,7 +129,8 @@
     "# Create the Tensorflow model graph\n",
     "ppo_model = create_agent_model(env, lr=learning_rate,\n",
     "                               h_size=hidden_units, epsilon=epsilon,\n",
-    "                               beta=beta, max_step=max_steps)\n",
+    "                               beta=beta, max_step=max_steps, \n",
+    "                               normalize=normalize, num_layers=num_layers)\n",
     "\n",
     "is_continuous = (env.brains[brain_name].action_space_type == \"continuous\")\n",
     "use_observations = (env.brains[brain_name].number_observations > 0)\n",
@@ -160,7 +166,7 @@
     "        if env.global_done:\n",
     "            info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]\n",
     "        # Decide and take an action\n",
-    "        new_info = trainer.take_action(info, env, brain_name, steps)\n",
+    "        new_info = trainer.take_action(info, env, brain_name, steps, normalize)\n",
     "        info = new_info\n",
     "        trainer.process_experiences(info, time_horizon, gamma, lambd)\n",
     "        if len(trainer.training_buffer['actions']) > buffer_size and train_model:\n",
@@ -208,21 +214,21 @@
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.10"
+   "pygments_lexer": "ipython3",
+   "version": "3.6.2"
   }
  },
 "nbformat": 4,

python/ppo.py

Lines changed: 7 additions & 2 deletions

@@ -27,7 +27,9 @@
     --learning-rate=<rate>     Model learning rate [default: 3e-4].
     --load                     Whether to load the model or randomly initialize [default: False].
     --max-steps=<n>            Maximum number of steps to run environment [default: 1e6].
+    --normalize                Whether to normalize the state input using running statistics [default: False].
     --num-epoch=<n>            Number of gradient descent steps per batch of experiences [default: 5].
+    --num-layers=<n>           Number of hidden layers between state/observation and outputs [default: 2].
     --run-path=<path>          The sub-directory name for model and summary statistics [default: ppo].
     --save-freq=<n>            Frequency at which to save model [default: 50000].
     --summary-freq=<n>         Frequency at which to save training statistics [default: 10000].
@@ -60,11 +62,13 @@
 time_horizon = int(options['--time-horizon'])
 beta = float(options['--beta'])
 num_epoch = int(options['--num-epoch'])
+num_layers = int(options['--num-layers'])
 epsilon = float(options['--epsilon'])
 buffer_size = int(options['--buffer-size'])
 learning_rate = float(options['--learning-rate'])
 hidden_units = int(options['--hidden-units'])
 batch_size = int(options['--batch-size'])
+normalize = options['--normalize']
 
 env = UnityEnvironment(file_name=env_name, worker_id=worker_id, curriculum=curriculum_file)
 print(str(env))
@@ -75,7 +79,8 @@
 # Create the Tensorflow model graph
 ppo_model = create_agent_model(env, lr=learning_rate,
                                h_size=hidden_units, epsilon=epsilon,
-                               beta=beta, max_step=max_steps)
+                               beta=beta, max_step=max_steps,
+                               normalize=normalize, num_layers=num_layers)
 
 is_continuous = (env.brains[brain_name].action_space_type == "continuous")
 use_observations = (env.brains[brain_name].number_observations > 0)
@@ -124,7 +129,7 @@ def get_progress():
             info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
             trainer.reset_buffers(info, total=True)
         # Decide and take an action
-        new_info = trainer.take_action(info, env, brain_name, steps)
+        new_info = trainer.take_action(info, env, brain_name, steps, normalize)
         info = new_info
         trainer.process_experiences(info, time_horizon, gamma, lambd)
         if len(trainer.training_buffer['actions']) > buffer_size and train_model:
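
Because ppo.py parses its flags with docopt, --normalize is a plain switch (docopt returns True when it is present, False otherwise), while --num-layers=<n> arrives as a string and is cast with int(). A small self-contained sketch of that behaviour, using an abbreviated usage string rather than the full ppo.py docstring:

    # Hedged sketch: abbreviated docopt usage string, not the full ppo.py docstring.
    from docopt import docopt

    _USAGE = """
    Usage:
      ppo.py [options]

    Options:
      --normalize         Whether to normalize the state input using running statistics [default: False].
      --num-layers=<n>    Number of hidden layers between state/observation and outputs [default: 2].
    """

    options = docopt(_USAGE, argv=['--normalize', '--num-layers=3'])
    normalize = options['--normalize']          # True: switches parse to booleans
    num_layers = int(options['--num-layers'])   # value options parse to strings, hence int()

This is why the diff assigns normalize = options['--normalize'] with no cast, while num_layers goes through int() like the other numeric options.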

python/ppo/models.py

Lines changed: 46 additions & 30 deletions

@@ -5,7 +5,7 @@
 from unityagents import UnityEnvironmentException
 
 
-def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6):
+def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6, normalize=False, num_layers=2):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
     appropriate PPO agent model for the environment.
@@ -17,12 +17,14 @@ def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_ste
     :return: a sub-class of PPOAgent tailored to the environment.
     :param max_step: Total number of training steps.
     """
+    if num_layers < 1: num_layers = 1
+
     brain_name = env.brain_names[0]
     brain = env.brains[brain_name]
     if brain.action_space_type == "continuous":
-        return ContinuousControlModel(lr, brain, h_size, epsilon, max_step)
+        return ContinuousControlModel(lr, brain, h_size, epsilon, max_step, normalize, num_layers)
     if brain.action_space_type == "discrete":
-        return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step)
+        return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers)
 
 
 def save_model(sess, saver, model_path="./", steps=0):
@@ -57,6 +59,9 @@ def export_graph(model_path, env_name="env", target_nodes="action,value_estimate
 
 
 class PPOModel(object):
+    def __init__(self):
+        self.normalize = False
+
     def create_global_steps(self):
         """Creates TF ops to track and increment global training step."""
         self.global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
@@ -68,7 +73,7 @@ def create_reward_encoder(self):
         self.new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
         self.update_reward = tf.assign(self.last_reward, self.new_reward)
 
-    def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation):
+    def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation, num_layers):
         """
         Builds a set of visual (CNN) encoders.
         :param o_size_h: Height observation size.
@@ -92,11 +97,13 @@ def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, act
                                           use_bias=False, activation=activation)
             self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                                           use_bias=False, activation=activation)
-            hidden = tf.layers.dense(c_layers.flatten(self.conv2), h_size, use_bias=False, activation=activation)
+            hidden = c_layers.flatten(self.conv2)
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
             streams.append(hidden)
         return streams
 
-    def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation):
+    def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
         """
         Builds a set of hidden state encoders.
         :param s_size: state input size.
@@ -107,27 +114,30 @@ def create_continuous_state_encoder(self, s_size, h_size, num_streams, activatio
         """
         self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32, name='state')
 
-        self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32,
-                                            initializer=tf.zeros_initializer())
-        self.running_variance = tf.get_variable("running_variance", [s_size], trainable=False, dtype=tf.float32,
-                                                initializer=tf.ones_initializer())
-
-        self.normalized_state = tf.clip_by_value((self.state_in - self.running_mean) / tf.sqrt(
-            self.running_variance / (tf.cast(self.global_step, tf.float32) + 1)), -5, 5, name="normalized_state")
+        if self.normalize:
+            self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32,
+                                                initializer=tf.zeros_initializer())
+            self.running_variance = tf.get_variable("running_variance", [s_size], trainable=False, dtype=tf.float32,
+                                                    initializer=tf.ones_initializer())
 
-        self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_mean')
-        self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_variance')
-        self.update_mean = tf.assign(self.running_mean, self.new_mean)
-        self.update_variance = tf.assign(self.running_variance, self.new_variance)
+            self.normalized_state = tf.clip_by_value((self.state_in - self.running_mean) / tf.sqrt(
+                self.running_variance / (tf.cast(self.global_step, tf.float32) + 1)), -5, 5, name="normalized_state")
 
+            self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_mean')
+            self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_variance')
+            self.update_mean = tf.assign(self.running_mean, self.new_mean)
+            self.update_variance = tf.assign(self.running_variance, self.new_variance)
+        else:
+            self.normalized_state = self.state_in
         streams = []
         for i in range(num_streams):
-            hidden_1 = tf.layers.dense(self.normalized_state, h_size, use_bias=False, activation=activation)
-            hidden_2 = tf.layers.dense(hidden_1, h_size, use_bias=False, activation=activation)
-            streams.append(hidden_2)
+            hidden = self.normalized_state
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
+            streams.append(hidden)
         return streams
 
-    def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation):
+    def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
         """
         Builds a set of hidden state encoders from discrete state input.
         :param s_size: state input size (discrete).
@@ -140,8 +150,10 @@ def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation)
         state_in = tf.reshape(self.state_in, [-1])
         state_onehot = c_layers.one_hot_encoding(state_in, s_size)
         streams = []
+        hidden = state_onehot
         for i in range(num_streams):
-            hidden = tf.layers.dense(state_onehot, h_size, use_bias=False, activation=activation)
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
             streams.append(hidden)
         return streams
 
@@ -186,29 +198,31 @@ def create_ppo_optimizer(self, probs, old_probs, value, entropy, beta, epsilon,
 
 
 class ContinuousControlModel(PPOModel):
-    def __init__(self, lr, brain, h_size, epsilon, max_step):
+    def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
         """
         Creates Continuous Control Actor-Critic model.
         :param brain: State-space size
        :param h_size: Hidden layer size
         """
+        super().__init__()
         s_size = brain.state_space_size
         a_size = brain.action_space_size
 
+        self.normalize = normalize
         self.create_global_steps()
         self.create_reward_encoder()
 
         hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
         if brain.number_observations > 0:
             height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
             bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh)
+            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
-                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh)
+                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)
             else:
-                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh)
+                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)
 
         if hidden_visual is None and hidden_state is None:
             raise Exception("No valid network configuration possible. "
@@ -249,26 +263,28 @@ def __init__(self, lr, brain, h_size, epsilon, max_step):
 
 
 class DiscreteControlModel(PPOModel):
-    def __init__(self, lr, brain, h_size, epsilon, beta, max_step):
+    def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers):
         """
         Creates Discrete Control Actor-Critic model.
         :param brain: State-space size
         :param h_size: Hidden layer size
         """
+        super().__init__()
         self.create_global_steps()
         self.create_reward_encoder()
+        self.normalize = normalize
 
         hidden_state, hidden_visual, hidden = None, None, None
         if brain.number_observations > 0:
             height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
             bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu)[0]
+            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
-                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu)[0]
+                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]
             else:
-                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu)[0]
+                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]
 
         if hidden_visual is None and hidden_state is None:
             raise Exception("No valid network configuration possible. "
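
Two things are worth spelling out about the model changes. First, num_layers just repeats the dense-layer construction in a loop, so with the default of 2 the continuous state encoder keeps its previous depth while the visual and discrete encoders gain one layer relative to the old single dense layer. Second, when normalize=True the graph standardizes states against running statistics, dividing the running variance by the step count before the square root and clipping to [-5, 5]. A NumPy sketch of that normalization op (function and argument names are illustrative, not part of the module):

    import numpy as np

    def normalize_state(state, running_mean, running_variance, global_step, clip=5.0):
        # Mirrors the normalized_state op built when normalize=True:
        # (state - mean) / sqrt(variance / (step + 1)), clipped to [-clip, clip].
        normalized = (state - running_mean) / np.sqrt(running_variance / (global_step + 1.0))
        return np.clip(normalized, -clip, clip)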

python/ppo/trainer.py

Lines changed: 2 additions & 2 deletions

@@ -41,7 +41,7 @@ def running_average(self, data, steps, running_mean, running_variance):
         new_variance = var + (current_x - new_mean) * (current_x - mean)
         return new_mean, new_variance
 
-    def take_action(self, info, env, brain_name, steps):
+    def take_action(self, info, env, brain_name, steps, normalize):
         """
         Decides actions given state/observation information, and takes them in environment.
         :param info: Current BrainInfo from environment.
@@ -60,7 +60,7 @@ def take_action(self, info, env, brain_name, steps):
             feed_dict[self.model.observation_in] = np.vstack(info.observations)
         if self.use_states:
             feed_dict[self.model.state_in] = info.states
-        if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states:
+        if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states and normalize:
             new_mean, new_variance = self.running_average(info.states, steps, self.model.running_mean,
                                                           self.model.running_variance)
             feed_dict[self.model.new_mean] = new_mean
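
The trainer change only threads the flag through: the incremental running_average update and the new_mean/new_variance feeds already existed, and are now skipped entirely unless normalize is set. For reference, a hedged NumPy reconstruction of that Welford-style update; only the variance line appears in the hunk above, so the mean-update step is inferred from the standard incremental formula rather than taken from the source:

    import numpy as np

    def running_average(data, steps, running_mean, running_variance):
        # Incremental mean/variance update over the batch mean of the new states.
        current_x = np.mean(data, axis=0)
        new_mean = running_mean + (current_x - running_mean) / (steps + 1)                      # inferred step
        new_variance = running_variance + (current_x - new_mean) * (current_x - running_mean)   # from the hunk
        return new_mean, new_variance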
