yahoo
diff --git a/tensorflowonspark/TFCluster.py b/tensorflowonspark/TFCluster.py
Lines changed: 19 additions & 6 deletions
diff --git a/tensorflowonspark/TFNode.py b/tensorflowonspark/TFNode.py
Lines changed: 22 additions & 11 deletions
@@ -36,11 +36,13 @@
 # status of TF background job
 tf_status = {}
 
+
 class InputMode(object):
   """Enum for the input modes of data feeding."""
   TENSORFLOW = 0                #: TensorFlow application is responsible for reading any data.
   SPARK = 1                     #: Spark is responsible for feeding data to the TensorFlow application via an RDD.
 
+
 class TFCluster(object):
 
   sc = None                     #: SparkContext
@@ -197,8 +199,9 @@ def tensorboard_url(self):
         tb_url = "http://{0}:{1}".format(node['host'], node['tb_port'])
     return tb_url
 
+
 def run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mode=InputMode.TENSORFLOW,
-        log_dir=None, driver_ps_nodes=False, reservation_timeout=600, queues=['input', 'output', 'error']):
+        log_dir=None, driver_ps_nodes=False, master_node=None, reservation_timeout=600, queues=['input', 'output', 'error']):
   """Starts the TensorFlowOnSpark cluster and Runs the TensorFlow "main" function on the Spark executors
 
   Args:
@@ -211,6 +214,7 @@ def run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mo
     :input_mode: TFCluster.InputMode
     :log_dir: directory to save tensorboard event logs.  If None, defaults to a fixed path on local filesystem.
     :driver_ps_nodes: run the PS nodes on the driver locally instead of on the spark executors; this help maximizing computing resources (esp. GPU). You will need to set cluster_size = num_executors + num_ps
+    :master_node: name of the "master" or "chief" node in the cluster_template, used for `tf.estimator` applications.
     :reservation_timeout: number of seconds after which cluster reservation times out (600 sec default)
     :queues: *INTERNAL_USE*
 
@@ -226,8 +230,13 @@ def run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mo
   # build a cluster_spec template using worker_nums
   cluster_template = {}
   cluster_template['ps'] = range(num_ps)
-  cluster_template['worker'] = range(num_ps, num_executors)
-  logging.info("worker node range %s, ps node range %s" % (cluster_template['worker'], cluster_template['ps']))
+  if master_node is None:
+    cluster_template['worker'] = range(num_ps, num_executors)
+  else:
+    cluster_template[master_node] = range(num_ps, num_ps + 1)
+    if num_executors > num_ps + 1:
+      cluster_template['worker'] = range(num_ps + 1, num_executors)
+  logging.info("cluster_template: {}".format(cluster_template))
 
   # get default filesystem from spark
   defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
@@ -311,13 +320,17 @@ def _start(status):
     logging.info("")
     logging.info("========================================================================================")
 
-  # since our "primary key" for each executor's TFManager is (host, ppid), sanity check for duplicates
+  # since our "primary key" for each executor's TFManager is (host, executor_id), sanity check for duplicates
   # Note: this may occur if Spark retries failed Python tasks on the same executor.
   tb_nodes = set()
   for node in cluster_info:
-    node_id = (node['host'],node['ppid'])
+    node_id = (node['host'], node['executor_id'])
     if node_id in tb_nodes:
-      raise Exception("Duplicate cluster node id detected (host={0}, ppid={1}).  Please ensure that (1) the number of executors >= number of TensorFlow nodes, (2) the number of tasks per executors == 1, and (3) TFCluster.shutdown() is successfully invoked when done.".format(node_id[0], node_id[1]))
+      raise Exception("Duplicate cluster node id detected (host={0}, executor_id={1})\n".format(node_id[0], node_id[1]) +
+                      "Please ensure that:\n" +
+                      "1. Number of executors >= number of TensorFlow nodes\n" +
+                      "2. Number of tasks per executors is 1\n" +
+                      "3. TFCluster.shutdown() is successfully invoked when done.")
     else:
       tb_nodes.add(node_id)
 
 
@@ -21,6 +21,7 @@
 from six.moves.queue import Empty
 from . import marker
 
+
 def hdfs_path(ctx, path):
   """Convenience function to create a Tensorflow-compatible absolute HDFS path from relative paths
 
@@ -47,6 +48,7 @@ def hdfs_path(ctx, path):
       logging.warn("Unknown scheme {0} with relative path: {1}".format(ctx.defaultFS, path))
       return "{0}/{1}".format(ctx.defaultFS, path)
 
+
 def start_cluster_server(ctx, num_gpus=1, rdma=False):
   """Function that wraps the creation of TensorFlow ``tf.train.Server`` for a node in a distributed TensorFlow cluster.
 
@@ -71,7 +73,8 @@ def start_cluster_server(ctx, num_gpus=1, rdma=False):
   if tf.test.is_built_with_cuda():
     # GPU
     gpu_initialized = False
-    while not gpu_initialized:
+    retries = 3
+    while not gpu_initialized and retries > 0:
       try:
         # override PS jobs to only reserve one GPU
         if ctx.job_name == 'ps':
@@ -97,7 +100,10 @@ def start_cluster_server(ctx, num_gpus=1, rdma=False):
       except Exception as e:
         print(e)
         logging.error("{0}: Failed to allocate GPU, trying again...".format(ctx.worker_num))
+        retries -= 1
         time.sleep(10)
+    if not gpu_initialized:
+      raise Exception("Failed to allocate GPU")
   else:
     # CPU
     os.environ['CUDA_VISIBLE_DEVICES'] = ''
@@ -111,10 +117,12 @@ def start_cluster_server(ctx, num_gpus=1, rdma=False):
 
   return (cluster, server)
 
+
 def next_batch(mgr, batch_size, qname='input'):
   """*DEPRECATED*. Use TFNode.DataFeed class instead."""
   raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
 
+
 def export_saved_model(sess, export_dir, tag_set, signatures):
   """Convenience function to export a saved_model using provided arguments
 
@@ -148,25 +156,29 @@ def export_saved_model(sess, export_dir, tag_set, signatures):
   signature_def_map = {}
   for key, sig in signatures.items():
     signature_def_map[key] = tf.saved_model.signature_def_utils.build_signature_def(
-              inputs={ name:tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['inputs'].items() },
-              outputs={ name:tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['outputs'].items() },
-              method_name=sig['method_name'] if 'method_name' in sig else key)
+        inputs={name: tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['inputs'].items()},
+        outputs={name: tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['outputs'].items()},
+        method_name=sig['method_name'] if 'method_name' in sig else key)
   logging.info("===== signature_def_map: {}".format(signature_def_map))
-  builder.add_meta_graph_and_variables(sess,
-              tag_set.split(','),
-              signature_def_map=signature_def_map,
-              clear_devices=True)
+  builder.add_meta_graph_and_variables(
+      sess,
+      tag_set.split(','),
+      signature_def_map=signature_def_map,
+      clear_devices=True)
   g.finalize()
   builder.save()
 
+
 def batch_results(mgr, results, qname='output'):
   """*DEPRECATED*. Use TFNode.DataFeed class instead."""
   raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
 
+
 def terminate(mgr, qname='input'):
   """*DEPRECATED*. Use TFNode.DataFeed class instead."""
   raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
 
+
 class DataFeed(object):
   """This class manages the *InputMode.SPARK* data feeding process from the perspective of the TensorFlow application.
 
@@ -184,7 +196,7 @@ def __init__(self, mgr, train_mode=True, qname_in='input', qname_out='output', i
     self.qname_in = qname_in
     self.qname_out = qname_out
     self.done_feeding = False
-    self.input_tensors = [ tensor for col, tensor in sorted(input_mapping.items()) ] if input_mapping is not None else None
+    self.input_tensors = [tensor for col, tensor in sorted(input_mapping.items())] if input_mapping is not None else None
 
   def next_batch(self, batch_size):
     """Gets a batch of items from the input RDD.
@@ -206,7 +218,7 @@ def next_batch(self, batch_size):
     """
     logging.debug("next_batch() invoked")
     queue = self.mgr.get_queue(self.qname_in)
-    tensors = [] if self.input_tensors is None else { tensor:[] for tensor in self.input_tensors }
+    tensors = [] if self.input_tensors is None else {tensor: [] for tensor in self.input_tensors}
     count = 0
     while count < batch_size:
       item = queue.get(block=True)
@@ -276,4 +288,3 @@ def terminate(self):
       except Empty:
         logging.info("dropped {0} items from queue".format(count))
         done = True
-