
Commit d6acd8b

Merge pull request #379 from yahoo/leewyang_more_keras
Default keras example to tf.estimator
2 parents e392a02 + 5fc70a4 commit d6acd8b


5 files changed: +236 −97 lines changed


examples/mnist/keras/README.md

Lines changed: 58 additions & 10 deletions
@@ -2,11 +2,10 @@
 
 Original Source: https://github.com/fchollet/keras/blob/master/examples/mnist_mlp.py
 
-This is the MNIST Multi Layer Perceptron example from the [Keras examples](https://github.com/fchollet/keras/blob/master/examples), adapted for TensorFlowOnSpark.
+This is the MNIST Multi Layer Perceptron example from the [Keras examples](https://github.com/fchollet/keras/blob/master/examples), adapted for the `tf.estimator` API and TensorFlowOnSpark.
 
 Notes:
 - This example assumes that Spark, TensorFlow, and TensorFlowOnSpark are already installed.
-- Keras currently saves model checkpoints as [HDF5](https://support.hdfgroup.org/HDF5/) using the [h5py package](http://www.h5py.org/). Unfortunately, this is not currently supported on HDFS. Consequently, this example demonstrates how to save standard TensorFlow model checkpoints on HDFS via a Keras LambdaCallback. If you don't need HDFS support, you can use the standard ModelCheckpoint instead.
 - InputMode.SPARK only supports feeding data from a single RDD, so the validation dataset/code is disabled in the corresponding example.
 
 #### Launch the Spark Standalone cluster
@@ -24,19 +23,18 @@ Notes:
 In this mode, each worker will load the entire MNIST dataset into memory (automatically downloading the dataset if needed).
 
 # remove any old artifacts
-rm -rf ${TFoS_HOME}/mnist_model ${TFoS_HOME}/mnist_export
+rm -rf ${TFoS_HOME}/mnist_model
 
 # train and validate
 ${SPARK_HOME}/bin/spark-submit \
 --master ${MASTER} \
 --conf spark.cores.max=${TOTAL_CORES} \
 --conf spark.task.cpus=${CORES_PER_WORKER} \
 --conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \
-${TFoS_HOME}/examples/mnist/keras/mnist_mlp.py \
+${TFoS_HOME}/examples/mnist/keras/mnist_mlp_estimator.py \
 --cluster_size ${SPARK_WORKER_INSTANCES} \
 --input_mode tf \
 --model_dir ${TFoS_HOME}/mnist_model \
---export_dir ${TFoS_HOME}/mnist_export \
 --epochs 5 \
 --tensorboard
 
@@ -56,25 +54,75 @@ In this mode, Spark will distribute the MNIST dataset (as CSV) across the worker
 ls -lR ${TFoS_HOME}/mnist/csv
 
 # remove any old artifacts
-rm -rf ${TFoS_HOME}/mnist_model ${TFoS_HOME}/mnist_export
+rm -rf ${TFoS_HOME}/mnist_model
 
-# train and validate
+# train
 ${SPARK_HOME}/bin/spark-submit \
 --master ${MASTER} \
 --conf spark.cores.max=${TOTAL_CORES} \
 --conf spark.task.cpus=${CORES_PER_WORKER} \
 --conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \
-${TFoS_HOME}/examples/mnist/keras/mnist_mlp.py \
+${TFoS_HOME}/examples/mnist/keras/mnist_mlp_estimator.py \
 --cluster_size ${SPARK_WORKER_INSTANCES} \
 --input_mode spark \
 --images ${TFoS_HOME}/mnist/csv/train/images \
 --labels ${TFoS_HOME}/mnist/csv/train/labels \
 --epochs 5 \
 --model_dir ${TFoS_HOME}/mnist_model \
---export_dir ${TFoS_HOME}/mnist_export \
 --tensorboard
 
+#### Inference via saved_model_cli
+
+The training code will automatically export a TensorFlow SavedModel, which can be used with the `saved_model_cli` from the command line, as follows:
+
+# path to the SavedModel export
+export SAVED_MODEL=${TFoS_HOME}/mnist_model/export/serving/*
+
+# use a CSV formatted test example
+IMG=$(head -n 1 $TFoS_HOME/examples/mnist/csv/test/images/part-00000)
+
+# introspect model
+saved_model_cli show --dir $SAVED_MODEL --all
+
+# inference via saved_model_cli
+saved_model_cli run --dir $SAVED_MODEL --tag_set serve --signature_def serving_default --input_exprs "dense_input=[[$IMG]]"
+
+#### Inference via TF-Serving
+
+For online inferencing use cases, you can serve the SavedModel via a TensorFlow Serving instance as follows. Note that TF-Serving provides both GRPC and REST APIs, but we will only
+demonstrate the use of the REST API. Also, [per the TensorFlow Serving instructions](https://www.tensorflow.org/serving/), we will run the serving instance inside a Docker container.
+
+# Start the TF-Serving instance in a docker container
+docker pull tensorflow/serving
+docker run -t --rm -p 8501:8501 -v "${TFoS_HOME}/mnist_model/export/serving:/models/mnist" -e MODEL_NAME=mnist tensorflow/serving &
+
+# GET model status
+curl http://localhost:8501/v1/models/mnist
+
+# GET model metadata
+curl http://localhost:8501/v1/models/mnist/metadata
+
+# POST example for inferencing
+curl -v -d "{\"instances\": [ {\"dense_input\": [$IMG] } ]}" -X POST http://localhost:8501/v1/models/mnist:predict
+
+# Stop the TF-Serving container
+docker stop $(docker ps -q)
+
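The same REST endpoint can also be called programmatically. The following is a minimal sketch (an editorial illustration, not part of this commit) using the Python `requests` package against the container started above; the `image` values are placeholder data standing in for the 784 normalized pixels of one MNIST test example.

```python
import requests

# one flattened MNIST image: 784 pixel values in [0, 1]; placeholder data for illustration
image = [0.0] * 784

# same payload shape as the curl example above ("dense_input" is the exported input key)
payload = {"instances": [{"dense_input": image}]}

resp = requests.post("http://localhost:8501/v1/models/mnist:predict", json=payload)
resp.raise_for_status()

# TF-Serving's REST predict API returns one entry per instance; here, the class probabilities
print(resp.json()["predictions"][0])
```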
+#### Run Parallel Inferencing via Spark
+
+For batch inferencing use cases, you can use Spark to run multiple single-node TensorFlow instances in parallel (on the Spark executors). Each executor/instance will operate independently on a shard of the dataset. Note that this requires that the model fits in the memory of each executor.
+
+# remove any old artifacts
+rm -Rf ${TFoS_HOME}/predictions
+
+# inference
+${SPARK_HOME}/bin/spark-submit \
+--master $MASTER ${TFoS_HOME}/examples/mnist/keras/mnist_inference.py \
+--cluster_size 3 \
+--images_labels ${TFoS_HOME}/mnist/tfr/test \
+--export ${TFoS_HOME}/mnist_model/export/serving/* \
+--output ${TFoS_HOME}/predictions
+
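Each executor writes one `part-*` file of predicted digits (one `argmax` per line) under `${TFoS_HOME}/predictions`, as implemented in `mnist_inference.py` below. A quick sanity check of that output, sketched here as an editorial illustration (not part of this commit) and assuming the prediction files are on the local filesystem:

```python
import glob
import os
from collections import Counter

# assumes TFoS_HOME is set, as in the shell commands above
pred_dir = os.path.join(os.environ["TFoS_HOME"], "predictions")

counts = Counter()
for path in sorted(glob.glob(os.path.join(pred_dir, "part-*"))):
  with open(path) as f:
    counts.update(line.strip() for line in f if line.strip())

# distribution of predicted digit classes across all shards
print(counts.most_common())
```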
 #### Shutdown the Spark Standalone cluster
 
 ${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh
examples/mnist/keras/mnist_inference.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
# Copyright 2018 Yahoo Inc.
# Licensed under the terms of the Apache 2.0 license.
# Please see LICENSE file in the project root for terms.

# This example demonstrates how to leverage Spark for parallel inferencing from a SavedModel.
#
# Normally, you can use TensorFlowOnSpark to just form a TensorFlow cluster for training and inferencing.
# However, in some situations, you may have a SavedModel without the original code for defining the inferencing
# graph. In these situations, we can use Spark to instantiate a single-node TensorFlow instance on each executor,
# where each executor can independently load the model and inference on input data.
#
# Note: this particular example demonstrates use of `tf.data.Dataset` to read the input data for inferencing,
# but it could also be adapted to just use an RDD of TFRecords from Spark.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np
import tensorflow as tf

IMAGE_PIXELS = 28


def inference(it, num_workers, args):
  from tensorflowonspark import util

  # consume worker number from RDD partition iterator
  for i in it:
    worker_num = i
  print("worker_num: {}".format(i))

  # setup env for single-node TF
  util.single_node_env()

  # load saved_model using default tag and signature
  sess = tf.Session()
  tf.saved_model.loader.load(sess, ['serve'], args.export)

  # parse function for TFRecords
  def parse_tfr(example_proto):
    feature_def = {"label": tf.FixedLenFeature(10, tf.int64),
                   "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)}
    features = tf.parse_single_example(example_proto, feature_def)
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(tf.to_float(features['image']), norm)
    label = tf.to_float(features['label'])
    return (image, label)

  # define a new tf.data.Dataset (for inferencing)
  ds = tf.data.Dataset.list_files("{}/part-*".format(args.images_labels))
  ds = ds.shard(num_workers, worker_num)
  ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=1)
  ds = ds.map(parse_tfr).batch(10)
  iterator = ds.make_one_shot_iterator()
  image_label = iterator.get_next(name='inf_image')

  # create an output file per spark worker for the predictions
  tf.gfile.MakeDirs(args.output)
  output_file = tf.gfile.GFile("{}/part-{:05d}".format(args.output, worker_num), mode='w')

  while True:
    try:
      # get images and labels from tf.data.Dataset
      img, lbl = sess.run(['inf_image:0', 'inf_image:1'])

      # inference by feeding these images and labels into the input tensors
      # you can view the exported model signatures via:
      # saved_model_cli show --dir <export_dir> --all

      # note that we feed directly into the graph tensors (bypassing the exported signatures)
      # these tensors will be shown in the "name" field of the signature definitions

      outputs = sess.run(['dense_2/Softmax:0'], feed_dict={'Placeholder:0': img})
      for p in outputs[0]:
        output_file.write("{}\n".format(np.argmax(p)))
    except tf.errors.OutOfRangeError:
      break

  output_file.close()


if __name__ == '__main__':
  from pyspark.context import SparkContext
  from pyspark.conf import SparkConf

  sc = SparkContext(conf=SparkConf().setAppName("mnist_inference"))
  executors = sc._conf.get("spark.executor.instances")
  num_executors = int(executors) if executors is not None else 1

  parser = argparse.ArgumentParser()
  parser.add_argument("--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
  parser.add_argument('--images_labels', type=str, help='Directory for input images with labels')
  parser.add_argument("--export", help="HDFS path to export model", type=str, default="mnist_export")
  parser.add_argument("--output", help="HDFS path to save predictions", type=str, default="predictions")
  args, _ = parser.parse_known_args()
  print("args: {}".format(args))

  # Not using TFCluster... just running single-node TF instances on each executor
  nodes = list(range(args.cluster_size))
  nodeRDD = sc.parallelize(list(range(args.cluster_size)), args.cluster_size)
  nodeRDD.foreachPartition(lambda worker_num: inference(worker_num, args.cluster_size, args))
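As the inline comments note, the script feeds graph tensors by hard-coded names ('Placeholder:0' and 'dense_2/Softmax:0') rather than going through the exported signature. When those names are not known up front, one option is to read them from the SavedModel's signature definitions. The sketch below is an editorial illustration, not part of this commit; the 'serving_default' signature and the 'dense_input' input key match the saved_model_cli example earlier, but the exact keys for a given model should be confirmed with `saved_model_cli show`.

```python
import tensorflow as tf


def tensor_names_from_signature(sess, export_dir, signature_key='serving_default'):
  """Load a SavedModel and return {input_key: tensor_name} and {output_key: tensor_name} maps."""
  # tf.saved_model.loader.load returns the MetaGraphDef for the loaded tags,
  # which carries the signature definitions exported with the model.
  meta_graph_def = tf.saved_model.loader.load(sess, ['serve'], export_dir)
  sig = meta_graph_def.signature_def[signature_key]
  input_names = {key: info.name for key, info in sig.inputs.items()}    # e.g. {'dense_input': 'Placeholder:0'} (assumed)
  output_names = {key: info.name for key, info in sig.outputs.items()}  # e.g. {'dense_2': 'dense_2/Softmax:0'} (assumed)
  return input_names, output_names
```

With these maps in hand, the `sess.run` call above could feed `input_names['dense_input']` and fetch the signature's output tensor name instead of hard-coding them.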
