
Commit e066bcf

elibol authored and robertnishihara committed
Synchronous parameter server example. (#1220)
* Synchronous parameter server example.
* Added sync parameter server example to documentation index.
* Consolidate documentation and minor simplifications.
* Fix linting.
1 parent 428858c commit e066bcf

File tree

4 files changed, +165 -13 lines changed


doc/source/example-parameter-server.rst

Lines changed: 69 additions & 6 deletions
@@ -1,8 +1,9 @@
 Parameter Server
 ================

-This document walks through how to implement a simple parameter server example
-using actors. To run the application, first install some dependencies.
+This document walks through how to implement simple synchronous and asynchronous
+parameter servers using actors. To run the application, first install some
+dependencies.

 .. code-block:: bash

@@ -12,17 +13,24 @@ You can view the `code for this example`_.

 .. _`code for this example`: https://github.com/ray-project/ray/tree/master/examples/parameter_server

-The example can be run as follows.
+The examples can be run as follows.

 .. code-block:: bash

-  python ray/examples/parameter_server/parameter_server.py --num-workers=4
+  # Run the asynchronous parameter server.
+  python ray/examples/parameter_server/async_parameter_server.py --num-workers=4
+
+  # Run the synchronous parameter server.
+  python ray/examples/parameter_server/sync_parameter_server.py --num-workers=4

 Note that this examples uses distributed actor handles, which are still
 considered experimental.

-The parameter server itself is implemented as an actor, which exposes the
-methods ``push`` and ``pull``.
+Asynchronous Parameter Server
+-----------------------------
+
+The asynchronous parameter server itself is implemented as an actor, which
+exposes the methods ``push`` and ``pull``.

 .. code-block:: python

@@ -62,3 +70,58 @@ Then we can create a parameter server and initiate training as follows.

   ps = ParameterServer.remote(keys, initial_values)
   worker_tasks = [worker_task.remote(ps) for _ in range(4)]
+
+Synchronous Parameter Server
+----------------------------
+
+The parameter server is implemented as an actor, which exposes the
+methods ``apply_gradients`` and ``get_weights``. A constant linear scaling
+rule is applied by scaling the learning rate by the number of workers.
+
+.. code-block:: python
+
+  @ray.remote
+  class ParameterServer(object):
+      def __init__(self, learning_rate):
+          self.net = model.SimpleCNN(learning_rate=learning_rate)
+
+      def apply_gradients(self, *gradients):
+          self.net.apply_gradients(np.mean(gradients, axis=0))
+          return self.net.variables.get_flat()
+
+      def get_weights(self):
+          return self.net.variables.get_flat()
+
+
+Workers are actors which expose the method ``compute_gradients``.
+
+.. code-block:: python
+
+  @ray.remote
+  class Worker(object):
+      def __init__(self, worker_index, batch_size=50):
+          self.worker_index = worker_index
+          self.batch_size = batch_size
+          self.mnist = input_data.read_data_sets("MNIST_data", one_hot=True,
+                                                 seed=worker_index)
+          self.net = model.SimpleCNN()
+
+      def compute_gradients(self, weights):
+          self.net.variables.set_flat(weights)
+          xs, ys = self.mnist.train.next_batch(self.batch_size)
+          return self.net.compute_gradients(xs, ys)
+
+Training alternates between computing the gradients given the current weights
+from the parameter server and updating the parameter server's weights with the
+resulting gradients.
+
+.. code-block:: python
+
+  while True:
+      gradients = [worker.compute_gradients.remote(current_weights)
+                   for worker in workers]
+      current_weights = ps.apply_gradients.remote(*gradients)
+
+Both of these examples implement the parameter server using a single actor,
+however they can be easily extended to **shard the parameters across multiple
+actors**.

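For readers following the documentation diff above: the asynchronous ``ParameterServer`` that it references is untouched by this commit, so its ``push``/``pull`` code does not appear in the diff. Below is a minimal sketch of what such an actor might look like, assuming the parameters are simply kept in a dict keyed by variable name (an assumption, not a verbatim copy of the example).

  import numpy as np
  import ray


  @ray.remote
  class ParameterServer(object):
      def __init__(self, keys, values):
          # Assumed layout: one entry per variable name. Copy the values so
          # they can be updated in place.
          values = [np.copy(value) for value in values]
          self.weights = dict(zip(keys, values))

      def push(self, keys, values):
          # Add each worker-supplied update to the corresponding parameter.
          for key, value in zip(keys, values):
              self.weights[key] += value

      def pull(self, keys):
          return [self.weights[key] for key in keys]

Workers then interact with the server only through ``ps.push.remote(...)`` and ``ps.pull.remote(...)``, as seen in the renamed async_parameter_server.py below.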
examples/parameter_server/parameter_server.py renamed to examples/parameter_server/async_parameter_server.py

Lines changed: 4 additions & 5 deletions
@@ -10,8 +10,8 @@

 import model

-parser = argparse.ArgumentParser(description="Run the parameter server "
-                                             "example.")
+parser = argparse.ArgumentParser(description="Run the asynchronous parameter "
+                                             "server example.")
 parser.add_argument("--num-workers", default=4, type=int,
                     help="The number of workers to use.")
 parser.add_argument("--redis-address", default=None, type=str,
@@ -35,10 +35,9 @@ def pull(self, keys):


 @ray.remote
-def worker_task(ps):
+def worker_task(ps, batch_size=50):
     # Download MNIST.
     mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
-    batch_size = 50

     # Initialize the model.
     net = model.SimpleCNN()
@@ -55,7 +54,7 @@ def worker_task(ps):
         ps.push.remote(keys, gradients)


-if __name__ == '__main__':
+if __name__ == "__main__":
     args = parser.parse_args()

     ray.init(redis_address=args.redis_address)

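The unchanged middle of ``worker_task`` (elided by the diff above) is the usual asynchronous pull/compute/push loop. A rough sketch of the function as it lives inside this file is shown below; the keyed ``get_weights``/``set_weights`` helpers on ``SimpleCNN`` are assumptions, since model.py is only partially shown in this commit.

  @ray.remote
  def worker_task(ps, batch_size=50):
      # Download MNIST.
      mnist = input_data.read_data_sets("MNIST_data", one_hot=True)

      # Initialize the model.
      net = model.SimpleCNN()
      keys = net.get_weights()[0]  # assumed helper returning (keys, values)

      while True:
          # Pull the latest weights from the parameter server.
          weights = ray.get(ps.pull.remote(keys))
          net.set_weights(keys, weights)  # assumed helper

          # Compute an update on one minibatch and push it back asynchronously.
          xs, ys = mnist.train.next_batch(batch_size)
          gradients = net.compute_update(xs, ys)
          ps.push.remote(keys, gradients)

Because ``batch_size`` is now a parameter of the remote function rather than a local constant, a caller can override it per task, e.g. ``worker_task.remote(ps, 100)``.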
examples/parameter_server/model.py

Lines changed: 13 additions & 2 deletions
@@ -11,7 +11,7 @@


 class SimpleCNN(object):
-    def __init__(self):
+    def __init__(self, learning_rate=1e-4):
         with tf.Graph().as_default():

             # Create the model
@@ -29,7 +29,7 @@ def __init__(self):
             self.cross_entropy = tf.reduce_mean(cross_entropy)

             with tf.name_scope('adam_optimizer'):
-                self.optimizer = tf.train.AdamOptimizer(1e-4)
+                self.optimizer = tf.train.AdamOptimizer(learning_rate)
                 self.train_step = self.optimizer.minimize(
                     self.cross_entropy)

@@ -51,6 +51,11 @@ def __init__(self):

             self.grads = self.optimizer.compute_gradients(
                 self.cross_entropy)
+            self.grads_placeholder = [
+                (tf.placeholder("float", shape=grad[1].get_shape()), grad[1])
+                for grad in self.grads]
+            self.apply_grads_placeholder = self.optimizer.apply_gradients(
+                self.grads_placeholder)

     def compute_update(self, x, y):
         # TODO(rkn): Computing the weights before and after the training step
@@ -68,6 +73,12 @@ def compute_gradients(self, x, y):
                                        self.y_: y,
                                        self.keep_prob: 0.5})

+    def apply_gradients(self, gradients):
+        feed_dict = {}
+        for i in range(len(self.grads_placeholder)):
+            feed_dict[self.grads_placeholder[i][0]] = gradients[i]
+        self.sess.run(self.apply_grads_placeholder, feed_dict=feed_dict)
+
     def compute_accuracy(self, x, y):
         return self.sess.run(self.accuracy,
                              feed_dict={self.x: x,
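The new ``apply_gradients`` path in ``SimpleCNN`` works by pairing a placeholder with each trainable variable and feeding externally computed gradients through those placeholders. The following is a self-contained, graph-mode (TF1) sketch of the same technique; the tiny variable and loss are made up purely for illustration.

  import numpy as np
  import tensorflow as tf  # TF1-style graph API, matching the example

  # Hypothetical tiny model: one variable and a quadratic loss.
  w = tf.Variable(tf.ones([2, 1]))
  loss = tf.reduce_sum(tf.square(w))

  optimizer = tf.train.AdamOptimizer(1e-4)
  grads_and_vars = optimizer.compute_gradients(loss)

  # One placeholder per variable, shaped like that variable, paired with it.
  grads_placeholder = [(tf.placeholder("float", shape=var.get_shape()), var)
                       for _, var in grads_and_vars]
  apply_op = optimizer.apply_gradients(grads_placeholder)

  with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      # The gradients could come from another process; here they are dummies.
      external_grads = [np.ones(var.get_shape().as_list(), dtype=np.float32)
                        for _, var in grads_and_vars]
      feed_dict = {ph: g for (ph, _), g in zip(grads_placeholder, external_grads)}
      sess.run(apply_op, feed_dict=feed_dict)

This is what lets the synchronous driver average the workers' gradients on the parameter server and apply the result in a single ``sess.run`` call.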
examples/parameter_server/sync_parameter_server.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+
+import numpy as np
+from tensorflow.examples.tutorials.mnist import input_data
+
+import ray
+import model
+
+parser = argparse.ArgumentParser(description="Run the synchronous parameter "
+                                             "server example.")
+parser.add_argument("--num-workers", default=4, type=int,
+                    help="The number of workers to use.")
+parser.add_argument("--redis-address", default=None, type=str,
+                    help="The Redis address of the cluster.")
+
+
+@ray.remote
+class ParameterServer(object):
+    def __init__(self, learning_rate):
+        self.net = model.SimpleCNN(learning_rate=learning_rate)
+
+    def apply_gradients(self, *gradients):
+        self.net.apply_gradients(np.mean(gradients, axis=0))
+        return self.net.variables.get_flat()
+
+    def get_weights(self):
+        return self.net.variables.get_flat()
+
+
+@ray.remote
+class Worker(object):
+    def __init__(self, worker_index, batch_size=50):
+        self.worker_index = worker_index
+        self.batch_size = batch_size
+        self.mnist = input_data.read_data_sets("MNIST_data", one_hot=True,
+                                               seed=worker_index)
+        self.net = model.SimpleCNN()
+
+    def compute_gradients(self, weights):
+        self.net.variables.set_flat(weights)
+        xs, ys = self.mnist.train.next_batch(self.batch_size)
+        return self.net.compute_gradients(xs, ys)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    ray.init(redis_address=args.redis_address)
+
+    # Create a parameter server.
+    net = model.SimpleCNN()
+    ps = ParameterServer.remote(1e-4 * args.num_workers)
+
+    # Create workers.
+    workers = [Worker.remote(worker_index)
+               for worker_index in range(args.num_workers)]
+
+    # Download MNIST.
+    mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
+
+    i = 0
+    current_weights = ps.get_weights.remote()
+    while True:
+        # Compute and apply gradients.
+        gradients = [worker.compute_gradients.remote(current_weights)
+                     for worker in workers]
+        current_weights = ps.apply_gradients.remote(*gradients)
+
+        if i % 10 == 0:
+            # Evaluate the current model.
+            net.variables.set_flat(ray.get(current_weights))
+            test_xs, test_ys = mnist.test.next_batch(1000)
+            accuracy = net.compute_accuracy(test_xs, test_ys)
+            print("Iteration {}: accuracy is {}".format(i, accuracy))
+        i += 1

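The documentation's closing note, that both examples can be extended to shard the parameters across multiple actors, can be sketched as follows. The shard-per-actor class, the plain SGD update, and the sizes are all hypothetical and only meant to show the shape of the extension.

  import numpy as np
  import ray


  @ray.remote
  class ShardedParameterServer(object):
      def __init__(self, shard):
          # Each actor owns one contiguous slice of the flat weight vector;
          # copy it so it can be updated in place.
          self.shard = shard.copy()

      def apply_gradients(self, *gradient_shards):
          # Average this shard of every worker's gradient and take a plain
          # SGD step (stand-in for the real optimizer).
          self.shard -= 1e-4 * np.mean(gradient_shards, axis=0)
          return self.shard

      def get_weights(self):
          return self.shard


  if __name__ == "__main__":
      ray.init()

      num_shards = 4
      flat_weights = np.zeros(1000, dtype=np.float32)  # hypothetical initial weights
      shards = np.array_split(flat_weights, num_shards)
      servers = [ShardedParameterServer.remote(shard) for shard in shards]

      # Pulling the full weight vector means concatenating the shards; pushing
      # means splitting each worker's flat gradient the same way.
      current_weights = np.concatenate(
          ray.get([server.get_weights.remote() for server in servers]))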