
Commit 5fc70a4

add parallel inferencing code to mnist/keras example
1 parent 07494c0

File tree

examples/mnist/keras/README.md
examples/mnist/keras/mnist_inference.py
examples/mnist/keras/mnist_mlp_estimator.py
examples/mnist/tf/mnist_inference.py

4 files changed: +152 −45 lines changed

examples/mnist/keras/README.md

Lines changed: 41 additions & 26 deletions
@@ -71,43 +71,58 @@ In this mode, Spark will distribute the MNIST dataset (as CSV) across the worker
     --model_dir ${TFoS_HOME}/mnist_model \
     --tensorboard
 
+#### Inference via saved_model_cli
 
-#### Shutdown the Spark Standalone cluster
+The training code will automatically export a TensorFlow SavedModel, which can be used with the `saved_model_cli` from the command line, as follows:
 
-    ${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh
+    # path to the SavedModel export
+    export SAVED_MODEL=${TFoS_HOME}/mnist_model/export/serving/*
+
+    # use a CSV formatted test example
+    IMG=$(head -n 1 $TFoS_HOME/examples/mnist/csv/test/images/part-00000)
+
+    # introspect model
+    saved_model_cli show --dir $SAVED_MODEL --all
+
+    # inference via saved_model_cli
+    saved_model_cli run --dir $SAVED_MODEL --tag_set serve --signature_def serving_default --input_exp "dense_input=[[$IMG]]"
 
 #### Inference via TF-Serving
 
-The training code will automatically export a TensorFlow SavedModel, which can be used with TensorFlow Serving as follows.
+For online inferencing use cases, you can serve the SavedModel via a TensorFlow Serving instance as follows. Note that TF-Serving provides both GRPC and REST APIs, but we will only
+demonstrate the use of the REST API. Also, [per the TensorFlow Serving instructions](https://www.tensorflow.org/serving/), we will run the serving instance inside a Docker container.
 
-Note: we use Docker to run the TF-Serving instance, per [recommendation](https://www.tensorflow.org/serving/).
-```
-# path to the SavedModel export
-export MODEL=${TFoS_HOME}/mnist_model/export/serving/*
+    # Start the TF-Serving instance in a docker container
+    docker pull tensorflow/serving
+    docker run -t --rm -p 8501:8501 -v "${TFoS_HOME}/mnist_model/export/serving:/models/mnist" -e MODEL_NAME=mnist tensorflow/serving &
 
-# use the CSV formatted data as a single example
-IMG=$(head -n 1 $TFoS_HOME/examples/mnist/csv/test/images/part-00000)
+    # GET model status
+    curl http://localhost:8501/v1/models/mnist
 
-# introspect model
-saved_model_cli show --dir $MODEL --all
+    # GET model metadata
+    curl http://localhost:8501/v1/models/mnist/metadata
 
-# inference via saved_model_cli
-saved_model_cli run --dir $MODEL --tag_set serve --signature_def serving_default --input_exp "dense_input=[[$IMG]]"
-# [[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]
+    # POST example for inferencing
+    curl -v -d "{\"instances\": [ {\"dense_input\": [$IMG] } ]}" -X POST http://localhost:8501/v1/models/mnist:predict
 
-# START the TF-Serving instance in a docker container
-docker pull tensorflow/serving
-docker run -t --rm -p 8501:8501 -v "${TFoS_HOME}/mnist_model/export/serving:/models/mnist" -e MODEL_NAME=mnist tensorflow/serving &
+    # Stop the TF-Serving container
+    docker stop $(docker ps -q)
 
-# GET model status
-curl http://localhost:8501/v1/models/mnist
+#### Run Parallel Inferencing via Spark
 
-# GET model metadata
-curl http://localhost:8501/v1/models/mnist/metadata
+For batch inferencing use cases, you can use Spark to run multiple single-node TensorFlow instances in parallel (on the Spark executors). Each executor/instance will operate independently on a shard of the dataset. Note that this requires that the model fits in the memory of each executor.
 
-# POST example for inferencing
-curl -v -d "{\"instances\": [ {\"dense_input\": [$IMG] } ]}" -X POST http://localhost:8501/v1/models/mnist:predict
+    # remove any old artifacts
+    rm -Rf ${TFoS_HOME}/predictions
 
-# STOP the TF-Serving container
-docker stop $(docker ps -q)
-```
+    # inference
+    ${SPARK_HOME}/bin/spark-submit \
+    --master $MASTER ${TFoS_HOME}/examples/mnist/keras/mnist_inference.py \
+    --cluster_size 3 \
+    --images_labels ${TFoS_HOME}/mnist/tfr/test \
+    --export ${TFoS_HOME}/mnist_model/export/serving/* \
+    --output ${TFoS_HOME}/predictions
+
+#### Shutdown the Spark Standalone cluster
+
+    ${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh
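
For reference, the REST `predict` call added to the README above can also be issued from Python. This is a minimal sketch, assuming the TF-Serving container from the `docker run` command is listening on `localhost:8501` and that the CSV test file path matches the one used in the README; it is not part of the commit itself:

```python
# Hedged sketch: issue the same POST request as the curl example above.
import requests

# read one test example (784 comma-separated pixel values)
with open("examples/mnist/csv/test/images/part-00000") as f:
    img = [float(x) for x in f.readline().split(",")]

# POST to the TF-Serving REST API, mirroring the curl command
resp = requests.post(
    "http://localhost:8501/v1/models/mnist:predict",
    json={"instances": [{"dense_input": img}]})
print(resp.json())  # ten softmax scores per instance
```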
examples/mnist/keras/mnist_inference.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
+# Copyright 2018 Yahoo Inc.
+# Licensed under the terms of the Apache 2.0 license.
+# Please see LICENSE file in the project root for terms.
+
+# This example demonstrates how to leverage Spark for parallel inferencing from a SavedModel.
+#
+# Normally, you can use TensorFlowOnSpark to just form a TensorFlow cluster for training and inferencing.
+# However, in some situations, you may have a SavedModel without the original code for defining the inferencing
+# graph. In these situations, we can use Spark to instantiate a single-node TensorFlow instance on each executor,
+# where each executor can independently load the model and inference on input data.
+#
+# Note: this particular example demonstrates use of `tf.data.Dataset` to read the input data for inferencing,
+# but it could also be adapted to just use an RDD of TFRecords from Spark.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import numpy as np
+import tensorflow as tf
+
+IMAGE_PIXELS = 28
+
+
+def inference(it, num_workers, args):
+  from tensorflowonspark import util
+
+  # consume worker number from RDD partition iterator
+  for i in it:
+    worker_num = i
+  print("worker_num: {}".format(i))
+
+  # setup env for single-node TF
+  util.single_node_env()
+
+  # load saved_model using default tag and signature
+  sess = tf.Session()
+  tf.saved_model.loader.load(sess, ['serve'], args.export)
+
+  # parse function for TFRecords
+  def parse_tfr(example_proto):
+    feature_def = {"label": tf.FixedLenFeature(10, tf.int64),
+                   "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)}
+    features = tf.parse_single_example(example_proto, feature_def)
+    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
+    image = tf.div(tf.to_float(features['image']), norm)
+    label = tf.to_float(features['label'])
+    return (image, label)
+
+  # define a new tf.data.Dataset (for inferencing)
+  ds = tf.data.Dataset.list_files("{}/part-*".format(args.images_labels))
+  ds = ds.shard(num_workers, worker_num)
+  ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=1)
+  ds = ds.map(parse_tfr).batch(10)
+  iterator = ds.make_one_shot_iterator()
+  image_label = iterator.get_next(name='inf_image')
+
+  # create an output file per spark worker for the predictions
+  tf.gfile.MakeDirs(args.output)
+  output_file = tf.gfile.GFile("{}/part-{:05d}".format(args.output, worker_num), mode='w')
+
+  while True:
+    try:
+      # get images and labels from tf.data.Dataset
+      img, lbl = sess.run(['inf_image:0', 'inf_image:1'])
+
+      # inference by feeding these images and labels into the input tensors
+      # you can view the exported model signatures via:
+      # saved_model_cli show --dir <export_dir> --all
+
+      # note that we feed directly into the graph tensors (bypassing the exported signatures)
+      # these tensors will be shown in the "name" field of the signature definitions
+
+      outputs = sess.run(['dense_2/Softmax:0'], feed_dict={'Placeholder:0': img})
+      for p in outputs[0]:
+        output_file.write("{}\n".format(np.argmax(p)))
+    except tf.errors.OutOfRangeError:
+      break
+
+  output_file.close()
+
+
+if __name__ == '__main__':
+  from pyspark.context import SparkContext
+  from pyspark.conf import SparkConf
+
+  sc = SparkContext(conf=SparkConf().setAppName("mnist_inference"))
+  executors = sc._conf.get("spark.executor.instances")
+  num_executors = int(executors) if executors is not None else 1
+
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
+  parser.add_argument('--images_labels', type=str, help='Directory for input images with labels')
+  parser.add_argument("--export", help="HDFS path to export model", type=str, default="mnist_export")
+  parser.add_argument("--output", help="HDFS path to save predictions", type=str, default="predictions")
+  args, _ = parser.parse_known_args()
+  print("args: {}".format(args))
+
+  # Not using TFCluster... just running single-node TF instances on each executor
+  nodes = list(range(args.cluster_size))
+  nodeRDD = sc.parallelize(list(range(args.cluster_size)), args.cluster_size)
+  nodeRDD.foreachPartition(lambda worker_num: inference(worker_num, args.cluster_size, args))
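
The `nodeRDD.foreachPartition(...)` pattern in the new file relies on `sc.parallelize(range(n), n)` producing `n` partitions with exactly one worker number each. The following is a minimal local sketch of that mapping, not code from this commit (names like `show_partition` and the `local[3]` master are illustrative assumptions):

```python
# Hedged sketch: each partition's iterator yields exactly one worker number,
# so each executor can independently load the model for its shard.
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setMaster("local[3]").setAppName("partition_demo"))

def show_partition(it):
    # the iterator yields the single worker number assigned to this partition
    for worker_num in it:
        print("this partition received worker_num: {}".format(worker_num))

num_workers = 3
sc.parallelize(list(range(num_workers)), num_workers).foreachPartition(show_partition)
sc.stop()
```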

examples/mnist/keras/mnist_mlp_estimator.py

Lines changed: 4 additions & 12 deletions
@@ -107,7 +107,7 @@ def train_input_fn():
 
   # WORKAROUND FOR https://github.com/tensorflow/tensorflow/issues/21745
   # wait for all other nodes to complete (via done files)
-  done_dir = "{}/{}/done".format(ctx.absolute_path(args.model_dir), args.mode)
+  done_dir = "{}/done".format(ctx.absolute_path(args.model_dir))
   print("Writing done file to: {}".format(done_dir))
   tf.gfile.MakeDirs(done_dir)
   with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index), 'w') as done_file:
@@ -157,14 +157,6 @@ def train_input_fn():
   images = sc.textFile(args.images).map(lambda ln: [float(x) for x in ln.split(',')])
   labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
   dataRDD = images.zip(labels)
-  if args.mode == 'train':
-    cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='master')
-    cluster.train(dataRDD, args.epochs)
-    cluster.shutdown()
-  else:
-    # Note: using "parallel" inferencing, not "cluster"
-    # each node loads the model and runs independently of others
-    cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, 0, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model_dir)
-    resultRDD = cluster.inference(dataRDD)
-    resultRDD.saveAsTextFile(args.output)
-    cluster.shutdown()
+  cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='master')
+  cluster.train(dataRDD, args.epochs)
+  cluster.shutdown()
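
The `done_dir` change above touches the done-file barrier used to work around tensorflow/tensorflow#21745: each node writes a done file, then waits until all peers have done the same. As a rough sketch of that pattern (the `wait_for_done_files` helper and its arguments are assumptions for illustration, not code from this commit):

```python
# Hedged sketch of the done-file barrier pattern (TF 1.x APIs, as in this repo).
import time
import tensorflow as tf

def wait_for_done_files(done_dir, num_workers, poll_secs=1):
  # hypothetical helper: block until every worker has written its done file
  while len(tf.gfile.ListDirectory(done_dir)) < num_workers:
    print("waiting for done files in {}".format(done_dir))
    time.sleep(poll_secs)
```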

examples/mnist/tf/mnist_inference.py

Lines changed: 4 additions & 7 deletions
@@ -17,14 +17,11 @@
 from __future__ import print_function
 
 import argparse
-import logging
-import sys
 import tensorflow as tf
-import time
-import traceback
 
 IMAGE_PIXELS = 28
 
+
 def inference(it, num_workers, args):
   from tensorflowonspark import util
 
@@ -69,9 +66,10 @@ def parse_tfr(example_proto):
 
       # inference by feeding these images and labels into the input tensors
       # you can view the exported model signatures via:
-      # saved_model_cli show --dir mnist_export --all
+      # saved_model_cli show --dir <saved_model> --all
 
       # note that we feed directly into the graph tensors (bypassing the exported signatures)
+      # these tensors will be shown in the "name" field of the signature definitions
       # also note that we can feed/fetch tensors that were not explicitly exported, e.g. `y_` and `label:0`
 
       labels, preds = sess.run(['label:0', 'prediction:0'], feed_dict={'x:0': img, 'y_:0': lbl})
@@ -82,8 +80,8 @@ def parse_tfr(example_proto):
 
   output_file.close()
 
+
 if __name__ == '__main__':
-  import os
   from pyspark.context import SparkContext
   from pyspark.conf import SparkConf
 
@@ -103,4 +101,3 @@ def parse_tfr(example_proto):
   nodes = list(range(args.cluster_size))
   nodeRDD = sc.parallelize(list(range(args.cluster_size)), args.cluster_size)
   nodeRDD.foreachPartition(lambda worker_num: inference(worker_num, args.cluster_size, args))
-
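
The updated comments above note that the underlying graph tensor names appear in the "name" field of the signature definitions. A minimal sketch, assuming a SavedModel export directory `export_dir` (the path is a placeholder, not from this commit), of listing those names programmatically with the TF 1.x loader, which returns the MetaGraphDef:

```python
# Hedged sketch: print the graph tensor names recorded in the SavedModel
# signatures (the "name" fields referenced in the comments above).
import tensorflow as tf

export_dir = "mnist_model/export/serving/1234567890"  # assumed path; adjust to your export

with tf.Session(graph=tf.Graph()) as sess:
  meta_graph = tf.saved_model.loader.load(sess, ['serve'], export_dir)
  sig = meta_graph.signature_def['serving_default']
  for key, info in sig.inputs.items():
    print("input '{}' -> tensor '{}'".format(key, info.name))
  for key, info in sig.outputs.items():
    print("output '{}' -> tensor '{}'".format(key, info.name))
```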

0 commit comments