
Commit 61f7bfb

Merge pull request #327 from yahoo/leewyang_estimator_inference
Distributed inferencing via estimator.predict
2 parents: 7d922f3 + b5db074

File tree

2 files changed: +91 additions, -53 deletions

examples/mnist/keras/mnist_mlp_estimator.py

Lines changed: 80 additions & 45 deletions
@@ -39,46 +39,43 @@ def main_fun(args, ctx):
   estimator = tf.keras.estimator.model_to_estimator(model, model_dir=args.model_dir)
 
   # setup train_input_fn for InputMode.TENSORFLOW or InputMode.SPARK
-  if args.input_mode == 'tf':
-    train_input_fn = tf.estimator.inputs.numpy_input_fn(
-        x={"dense_1_input": x_train},
-        y=y_train,
-        batch_size=128,
-        num_epochs=None,
-        shuffle=True)
-  else: # 'spark'
-    tf_feed = TFNode.DataFeed(ctx.mgr)
-
-    def rdd_generator():
-      while not tf_feed.should_stop():
-        batch = tf_feed.next_batch(1)
-        if len(batch) > 0:
-          record = batch[0]
-          image = numpy.array(record[0]).astype(numpy.float32) / 255.0
-          label = numpy.array(record[1]).astype(numpy.float32)
-          yield (image, label)
-
-    def train_input_fn():
-      ds = tf.data.Dataset.from_generator(rdd_generator,
-                                          (tf.float32, tf.float32),
-                                          (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]), tf.TensorShape([10])))
-      ds = ds.batch(args.batch_size)
-      return ds
-
-  # eval_input_fn ALWAYS uses data loaded in memory, since InputMode.SPARK can only feed one RDD at a time
-  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={"dense_1_input": x_test},
-      y=y_test,
-      num_epochs=args.epochs,
-      shuffle=False)
-
-  # setup tf.estimator.train_and_evaluate()
-  train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=args.steps)
-  eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
-  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
-
-  # export a saved_model, if export_dir provided
-  if args.export_dir:
+  if args.mode == 'train':
+    if args.input_mode == 'tf':
+      # For InputMode.TENSORFLOW, just use data in memory
+      train_input_fn = tf.estimator.inputs.numpy_input_fn(
+          x={"dense_1_input": x_train},
+          y=y_train,
+          batch_size=128,
+          num_epochs=None,
+          shuffle=True)
+    else: # 'spark'
+      # For InputMode.SPARK, read data from RDD
+      tf_feed = TFNode.DataFeed(ctx.mgr)
+
+      def rdd_generator():
+        while not tf_feed.should_stop():
+          batch = tf_feed.next_batch(1)
+          if len(batch) > 0:
+            record = batch[0]
+            image = numpy.array(record[0]).astype(numpy.float32) / 255.0
+            label = numpy.array(record[1]).astype(numpy.float32)
+            yield (image, label)
+
+      def train_input_fn():
+        ds = tf.data.Dataset.from_generator(rdd_generator,
+                                            (tf.float32, tf.float32),
+                                            (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]), tf.TensorShape([10])))
+        ds = ds.batch(args.batch_size)
+        return ds
+
+    # eval_input_fn ALWAYS uses data loaded in memory, since InputMode.SPARK can only feed one RDD at a time
+    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+        x={"dense_1_input": x_test},
+        y=y_test,
+        num_epochs=args.epochs,
+        shuffle=False)
+
+    # serving_input_receiver_fn ALWAYS expects serialized TFExamples in a placeholder.
     def serving_input_receiver_fn():
       """An input receiver that expects a serialized tf.Example."""
       serialized_tf_example = tf.placeholder(dtype=tf.string,
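
The Dataset.from_generator calls above pass the generator followed by its output types and output shapes as positional arguments. A minimal standalone sketch of the same pattern, using a dummy generator in place of the TFNode.DataFeed (all names and values below are illustrative, not part of this commit):

import numpy
import tensorflow as tf

IMAGE_PIXELS = 28

def dummy_generator():
  # stands in for rdd_generator(): yields one (image, label) pair at a time
  for _ in range(10):
    yield (numpy.zeros([IMAGE_PIXELS * IMAGE_PIXELS], dtype=numpy.float32),
           numpy.zeros([10], dtype=numpy.float32))

ds = tf.data.Dataset.from_generator(
    dummy_generator,
    (tf.float32, tf.float32),  # output_types: (image, label)
    (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]), tf.TensorShape([10])))  # output_shapes
ds = ds.batch(32)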
@@ -89,7 +86,35 @@ def serving_input_receiver_fn():
       features = tf.parse_example(serialized_tf_example, feature_spec)
       return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
 
-    estimator.export_savedmodel(args.export_dir, serving_input_receiver_fn)
+    # setup tf.estimator.train_and_evaluate() w/ FinalExporter
+    exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=serving_input_receiver_fn)
+    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=args.steps)
+    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, exporters=exporter)
+    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
+
+  else: # mode == 'inference'
+    if args.input_mode == 'spark':
+      tf_feed = TFNode.DataFeed(ctx.mgr)
+
+      def rdd_generator():
+        while not tf_feed.should_stop():
+          batch = tf_feed.next_batch(1)
+          if len(batch) > 0:
+            record = batch[0]
+            image = numpy.array(record[0]).astype(numpy.float32) / 255.0
+            label = numpy.array(record[1]).astype(numpy.float32)
+            yield (image, label)
+
+      def predict_input_fn():
+        ds = tf.data.Dataset.from_generator(rdd_generator,
+                                            (tf.float32, tf.float32),
+                                            (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]), tf.TensorShape([10])))
+        ds = ds.batch(args.batch_size)
+        return ds
+
+      predictions = estimator.predict(predict_input_fn)
+      for result in predictions:
+        tf_feed.batch_results([result])
 
 
 if __name__ == '__main__':
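
Because the FinalExporter is wired to serving_input_receiver_fn, the saved_model exported after training expects serialized tf.Example protos. A hedged sketch of building one such request (the 'dense_1_input' feature name mirrors the input_fn keys above; everything else is illustrative):

import numpy
import tensorflow as tf

image = numpy.zeros([28 * 28], dtype=numpy.float32)  # a flattened MNIST image
example = tf.train.Example(features=tf.train.Features(feature={
    'dense_1_input': tf.train.Feature(float_list=tf.train.FloatList(value=image.tolist()))
}))
serialized = example.SerializeToString()  # what the exported serving signature consumes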
@@ -112,6 +137,8 @@ def serving_input_receiver_fn():
   parser.add_argument("--input_mode", help="input mode (tf|spark)", default="tf")
   parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized CSV format")
   parser.add_argument("--model_dir", help="directory to write model checkpoints")
+  parser.add_argument("--mode", help="(train|inference)")
+  parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions")
   parser.add_argument("--num_ps", help="number of ps nodes", type=int, default=1)
   parser.add_argument("--steps", help="max number of steps to train", type=int, default=2000)
   parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
@@ -120,14 +147,22 @@ def serving_input_receiver_fn():
   print("args:", args)
 
   if args.input_mode == 'tf':
-    # for TENSORFLOW mode, each node will load/train entire dataset in memory per original example
+    # for TENSORFLOW mode, each node will load/train/infer entire dataset in memory per original example
     cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='master')
     cluster.shutdown()
   else: # 'spark'
     # for SPARK mode, just use CSV format as an example
     images = sc.textFile(args.images).map(lambda ln: [float(x) for x in ln.split(',')])
     labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
     dataRDD = images.zip(labels)
-    cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='master')
-    cluster.train(dataRDD, args.epochs)
-    cluster.shutdown()
+    if args.mode == 'train':
+      cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='master')
+      cluster.train(dataRDD, args.epochs)
+      cluster.shutdown()
+    else:
+      # Note: using "parallel" inferencing, not "cluster"
+      # each node loads the model and runs independently of others
+      cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, 0, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model_dir)
+      resultRDD = cluster.inference(dataRDD)
+      resultRDD.saveAsTextFile(args.output)
+      cluster.shutdown()
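
In the inference branch above, TFCluster.run is invoked with zero parameter servers and no master_node, so every executor restores the trained model from model_dir and calls estimator.predict on its own partition of dataRDD; cluster.inference then surfaces the per-example results as an RDD. A hedged sketch of inspecting that output on the driver before saving (illustrative only):

resultRDD = cluster.inference(dataRDD)
for pred in resultRDD.take(3):  # peek at a few prediction records
  print(pred)
resultRDD.saveAsTextFile(args.output)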

tensorflowonspark/TFSparkNode.py

Lines changed: 11 additions & 8 deletions
@@ -271,14 +271,17 @@ def _mapfn(iter):
         hosts.append("{0}:{1}".format(nhost, nport))
       spec[njob] = hosts
 
-    # update TF_CONFIG and reserve GPU for tf.estimator based code
-    # Note: this will execute but be ignored by non-tf.estimator code
-    tf_config = json.dumps({
-      'cluster': spec,
-      'task': {'type': job_name, 'index': task_index},
-      'environment': 'cloud'
-    })
-    os.environ['TF_CONFIG'] = tf_config
+    # update TF_CONFIG if cluster spec has a 'master' node (i.e. tf.estimator)
+    if 'master' in spec:
+      tf_config = json.dumps({
+        'cluster': spec,
+        'task': {'type': job_name, 'index': task_index},
+        'environment': 'cloud'
+      })
+      logging.info("export TF_CONFIG: {}".format(tf_config))
+      os.environ['TF_CONFIG'] = tf_config
+
+    # reserve GPU
     if tf.test.is_built_with_cuda():
       num_gpus = tf_args.num_gpus if 'num_gpus' in tf_args else 1
       gpus_to_use = gpu_info.get_gpus(num_gpus)
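
For reference, a hedged illustration of the TF_CONFIG this exports when the spec contains a 'master' node; all host:port values and the task assignment below are hypothetical:

# roughly what json.loads(os.environ['TF_CONFIG']) would yield on one worker
# of a 4-node estimator cluster (hosts/ports are made up):
{
    'cluster': {
        'master': ['node1:36000'],
        'ps': ['node2:36001'],
        'worker': ['node3:36002', 'node4:36003']
    },
    'task': {'type': 'worker', 'index': 0},
    'environment': 'cloud'
}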
