@@ -9,18 +9,21 @@
 from __future__ import print_function
 
 import logging
+import multiprocessing
 import os
-import sys
 import platform
 import socket
 import subprocess
-import multiprocessing
+import sys
 import uuid
+import time
+import traceback
+from threading import Thread
 
 from . import TFManager
 from . import TFNode
-from . import reservation
 from . import marker
+from . import reservation
 from . import util
 
 class TFNodeContext:
@@ -97,6 +100,14 @@ def _get_manager(cluster_info, host, ppid):
       authkey = node['authkey']
       TFSparkNode.mgr = TFManager.connect(addr, authkey)
       break
+
+  if TFSparkNode.mgr is None:
+    msg = "No TFManager found on this node, please ensure that:\n" + \
+          "1. Spark num_executors matches TensorFlow cluster_size\n" + \
+          "2. Spark cores/tasks per executor is 1.\n" + \
+          "3. Spark dynamic allocation is disabled."
+    raise Exception(msg)
+
   logging.info("Connected to TFSparkNode.mgr on {0}, ppid={1}, state={2}".format(host, ppid, str(TFSparkNode.mgr.get('state'))))
   return TFSparkNode.mgr
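
Note: the new guard fires when a Spark python worker lands on an executor that never started a TFManager, which usually means the executor layout does not match the TensorFlow cluster. A rough driver-side pre-flight check for the three conditions in the message could look like the following sketch (hypothetical helper, not part of this commit; assumes a YARN-style submission where spark.executor.instances is set):

    def check_executor_layout(sc, cluster_size):
      """Hypothetical pre-flight check mirroring the error message above."""
      conf = sc.getConf()
      if conf.get("spark.dynamicAllocation.enabled", "false") != "false":
        raise Exception("Spark dynamic allocation must be disabled")
      if int(conf.get("spark.executor.cores", "1")) != 1:
        raise Exception("Expected 1 core (task) per executor")
      if int(conf.get("spark.executor.instances", "-1")) != cluster_size:
        raise Exception("Spark num_executors must match TensorFlow cluster_size")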
@@ -152,7 +163,7 @@ def _mapfn(iter):
     addr = None
     if job_name == 'ps':
       # PS nodes must be remotely accessible in order to shutdown from Spark driver.
-      TFSparkNode.mgr = TFManager.start(authkey, ['control'], 'remote')
+      TFSparkNode.mgr = TFManager.start(authkey, ['control', 'error'], 'remote')
       addr = (host, TFSparkNode.mgr.address[1])
     else:
       # worker nodes only need to be locally accessible within the executor for data feeding
@@ -238,7 +249,11 @@ def _mapfn(iter):
     # construct a TensorFlow clusterspec from cluster_info
     sorted_cluster_info = sorted(cluster_info, key=lambda k: k['worker_num'])
     spec = {}
+    last_worker_num = -1
     for node in sorted_cluster_info:
+      if (node['worker_num'] == last_worker_num):
+        raise Exception("Duplicate worker/task in cluster_info")
+      last_worker_num = node['worker_num']
       logging.info("node: {0}".format(node))
       (njob, nhost, nport) = (node['job_name'], node['host'], node['port'])
       hosts = [] if njob not in spec else spec[njob]
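
The duplicate worker_num guard catches the case where two Spark tasks report the same TensorFlow task slot, typically a symptom of multiple tasks landing on one executor. For reference, the loop builds the standard {job_name: [host:port, ...]} mapping; with one PS and two workers it looks roughly like this (hosts and ports are made up):

    import tensorflow as tf

    spec = {
      'ps':     ['10.0.0.1:2222'],
      'worker': ['10.0.0.2:2223', '10.0.0.3:2224'],
    }
    cluster = tf.train.ClusterSpec(spec)  # how user code typically consumes the resulting dict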
@@ -268,20 +283,37 @@ def wrapper_fn(args, context):
       sys.argv = args
       fn(args, context)
 
+    def wrapper_fn_background(args, context):
+      """Wrapper function that signals exceptions to foreground process."""
+      errq = TFSparkNode.mgr.get_queue('error')
+      try:
+        wrapper_fn(args, context)
+      except Exception:
+        errq.put(traceback.format_exc())
+        errq.join()
+
     if job_name == 'ps' or background:
       # invoke the TensorFlow main function in a background thread
       logging.info("Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process".format(
           job_name, task_index, job_name, worker_num))
-      p = multiprocessing.Process(target=wrapper_fn, args=(tf_args, ctx))
+
+      p = multiprocessing.Process(target=wrapper_fn_background, args=(tf_args, ctx))
       if job_name == 'ps':
         p.daemon = True
       p.start()
 
       # for ps nodes only, wait indefinitely in foreground thread for a "control" event (None == "stop")
       if job_name == 'ps':
         queue = TFSparkNode.mgr.get_queue('control')
+        equeue = TFSparkNode.mgr.get_queue('error')
         done = False
         while not done:
+          while (queue.empty() and equeue.empty()):
+            time.sleep(1)
+          if (not equeue.empty()):
+            e_str = equeue.get()
+            equeue.task_done()
+            raise Exception("exception in ps:\n" + e_str)
           msg = queue.get(block=True)
           logging.info("Got msg: {0}".format(msg))
           if msg is None:
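
Taken together, wrapper_fn_background and the foreground loop form a simple parent/child error channel: the background process catches any exception, puts the formatted traceback on the shared 'error' queue, then blocks on errq.join() until the foreground acknowledges with task_done() and re-raises, which fails the Spark task instead of hanging it. A self-contained sketch of the same pattern using only the standard library (names are illustrative, not TFSparkNode's):

    import time
    import traceback
    from multiprocessing import Process, JoinableQueue

    def _child(fn, errq):
      """Run fn; on failure, ship the traceback to the parent and wait for an ack."""
      try:
        fn()
      except Exception:
        errq.put(traceback.format_exc())
        errq.join()  # stay alive until the parent calls task_done()

    def run_with_error_queue(fn):
      errq = JoinableQueue()
      p = Process(target=_child, args=(fn, errq))
      p.start()
      while p.is_alive():  # poll instead of join() so child errors surface promptly
        if not errq.empty():
          tb = errq.get()
          errq.task_done()
          raise Exception("exception in child:\n" + tb)
        time.sleep(1)
      if not errq.empty():  # defensive re-check after the child exits
        tb = errq.get()
        errq.task_done()
        raise Exception("exception in child:\n" + tb)

    def _boom():
      raise ValueError("boom")

    if __name__ == '__main__':
      run_with_error_queue(_boom)  # raises in the parent with the child's traceback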
@@ -311,7 +343,13 @@ def train(cluster_info, cluster_meta, qname='input'):
   def _train(iter):
     # get shared queue, reconnecting if necessary
     mgr = _get_manager(cluster_info, util.get_ip_address(), os.getppid())
-    queue = mgr.get_queue(qname)
+    try:
+      queue = mgr.get_queue(qname)
+      equeue = mgr.get_queue('error')
+    except (AttributeError, KeyError):
+      msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(qname)
+      raise Exception(msg)
+
     state = str(mgr.get('state'))
     logging.info("mgr.state={0}".format(state))
     terminating = state == "'terminating'"
@@ -321,15 +359,23 @@ def _train(iter):
       for item in iter:
         count += 1
       logging.info("Skipped {0} items from partition".format(count))
-
     else:
       logging.info("Feeding partition {0} into {1} queue {2}".format(iter, qname, queue))
       count = 0
       for item in iter:
         count += 1
         queue.put(item, block=True)
+
       # wait for consumers to finish processing all items in queue before "finishing" this iterator
-      queue.join()
+      joinThr = Thread(target=queue.join)
+      joinThr.start()
+      while (joinThr.isAlive()):
+        if (not equeue.empty()):
+          e_str = equeue.get()
+          equeue.task_done()
+          raise Exception("exception in worker:\n" + e_str)
+        time.sleep(1)
+      # queue.join()
       logging.info("Processed {0} items in partition".format(count))
 
     # check if TF is terminating feed after this partition
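
On the feeding side, the blocking queue.join() is replaced by a join running in a helper thread so the Spark task can keep polling the 'error' queue; if a TensorFlow worker dies, the feeder raises instead of waiting forever for task_done() calls that will never come. A minimal standalone sketch of that idiom (illustrative names, stdlib only):

    import time
    from threading import Thread

    def join_or_raise(data_queue, error_queue, poll_secs=1):
      """Join a JoinableQueue while surfacing errors reported by consumers."""
      joiner = Thread(target=data_queue.join)
      joiner.daemon = True  # in this standalone sketch, don't let a pending join block interpreter exit
      joiner.start()
      while joiner.is_alive():
        if not error_queue.empty():
          tb = error_queue.get()
          error_queue.task_done()
          raise Exception("exception in worker:\n" + tb)
        time.sleep(poll_secs)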
@@ -361,7 +407,12 @@ def inference(cluster_info, qname='input'):
   def _inference(iter):
     # get shared queue, reconnecting if necessary
     mgr = _get_manager(cluster_info, util.get_ip_address(), os.getppid())
-    queue_in = mgr.get_queue(qname)
+    try:
+      queue_in = mgr.get_queue(qname)
+      equeue = mgr.get_queue('error')
+    except (AttributeError, KeyError):
+      msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(qname)
+      raise Exception(msg)
 
     logging.info("Feeding partition {0} into {1} queue {2}".format(iter, qname, queue_in))
     count = 0
@@ -377,7 +428,15 @@ def _inference(iter):
       return []
 
     # wait for consumers to finish processing all items in queue before "finishing" this iterator
-    queue_in.join()
+    joinThr = Thread(target=queue_in.join)
+    joinThr.start()
+    while (joinThr.isAlive()):
+      if (not equeue.empty()):
+        e_str = equeue.get()
+        equeue.task_done()
+        raise Exception("exception in worker:\n" + e_str)
+      time.sleep(1)
+
     logging.info("Processed {0} items in partition".format(count))
 
     # read result queue
@@ -422,9 +481,13 @@ def _shutdown(iter):
     # terminate any listening queues
     logging.info("Stopping all queues")
     for q in queues:
-      queue = mgr.get_queue(q)
-      logging.info("Feeding None into {0} queue".format(q))
-      queue.put(None, block=True)
+      try:
+        queue = mgr.get_queue(q)
+        logging.info("Feeding None into {0} queue".format(q))
+        queue.put(None, block=True)
+      except (AttributeError, KeyError):
+        msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(q)
+        raise Exception(msg)
 
     logging.info("Setting mgr.state to 'stopped'")
     mgr.set('state', 'stopped')