Skip to content

Commit 0295f46

Browse files
leewyang authored and tmielika committed
Update examples to TF 1.11 APIs (#361)
* update dataset examples to latest APIs
* replace old examples with newer variants
* update imports
* simplify code
* remove examples/slim
* only request stop on initial terminating state
* add validation dataset to avoid new error
* update mnist/keras examples
1 parent 7df1215 commit 0295f46

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

71 files changed

+209
-13871
lines changed

examples/mnist/keras/README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ In this mode, Spark will distribute the MNIST dataset (as CSV) across the worker
6565
--conf spark.task.cpus=${CORES_PER_WORKER} \
6666
--conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \
6767
${TFoS_HOME}/examples/mnist/keras/mnist_mlp.py \
68-
--cluster_size 3 \
68+
--cluster_size ${SPARK_WORKER_INSTANCES} \
6969
--input_mode spark \
7070
--images ${TFoS_HOME}/mnist/csv/train/images \
7171
--labels ${TFoS_HOME}/mnist/csv/train/labels \

examples/mnist/keras/mnist_mlp.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ def main_fun(args, ctx):
1313
import tensorflow as tf
1414
from tensorflow.python import keras
1515
from tensorflow.python.keras import backend as K
16+
from tensorflow.python.keras.datasets import mnist
1617
from tensorflow.python.keras.models import Sequential, load_model, save_model
1718
from tensorflow.python.keras.layers import Dense, Dropout
1819
from tensorflow.python.keras.optimizers import RMSprop
@@ -51,7 +52,6 @@ def generate_rdd_data(tf_feed, batch_size):
5152

5253
# the data, shuffled and split between train and test sets
5354
if args.input_mode == 'tf':
54-
from tensorflow.python.keras.datasets import mnist
5555
(x_train, y_train), (x_test, y_test) = mnist.load_data()
5656
x_train = x_train.reshape(60000, 784)
5757
x_test = x_test.reshape(10000, 784)
@@ -64,6 +64,9 @@ def generate_rdd_data(tf_feed, batch_size):
6464
else: # args.mode == 'spark'
6565
x_train = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x_train")
6666
y_train = tf.placeholder(tf.float32, [None, 10], name="y_train")
67+
(_, _), (x_test, y_test) = mnist.load_data()
68+
x_test = x_test.reshape(10000, 784)
69+
y_test = keras.utils.to_categorical(y_test, num_classes)
6770

6871
model = Sequential()
6972
model.add(Dense(512, activation='relu', input_shape=(784,)))
@@ -109,6 +112,7 @@ def save_checkpoint(epoch, logs=None):
109112
steps_per_epoch=args.steps_per_epoch,
110113
epochs=args.epochs,
111114
verbose=1,
115+
validation_data=(x_test, y_test),
112116
callbacks=callbacks)
113117

114118
if args.export_dir and ctx.job_name == 'worker' and ctx.task_index == 0:
@@ -147,7 +151,7 @@ def save_checkpoint(epoch, logs=None):
147151
parser = argparse.ArgumentParser()
148152
parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
149153
parser.add_argument("--epochs", help="number of epochs of training data", type=int, default=20)
150-
parser.add_argument("--export_dir", help="directory to export saved_mode")
154+
parser.add_argument("--export_dir", help="directory to export saved_model")
151155
parser.add_argument("--images", help="HDFS path to MNIST images in parallelized CSV format")
152156
parser.add_argument("--input_mode", help="input mode (tf|spark)", default="tf")
153157
parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized CSV format")

examples/mnist/keras/mnist_mlp_estimator.py

Lines changed: 6 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
from tensorflow.python import keras
44
from tensorflow.python.keras.models import Sequential
55
from tensorflow.python.keras.layers import Dense, Dropout
6-
from tensorflow.python.keras.optimizers import RMSprop
76
from tensorflowonspark import TFNode
87

98

@@ -31,7 +30,7 @@ def main_fun(args, ctx):
3130
model.add(Dropout(0.2))
3231
model.add(Dense(10, activation='softmax'))
3332
model.compile(loss='categorical_crossentropy',
34-
optimizer=RMSprop(),
33+
optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
3534
metrics=['accuracy'])
3635
model.summary()
3736

@@ -43,7 +42,7 @@ def main_fun(args, ctx):
4342
if args.input_mode == 'tf':
4443
# For InputMode.TENSORFLOW, just use data in memory
4544
train_input_fn = tf.estimator.inputs.numpy_input_fn(
46-
x={"dense_1_input": x_train},
45+
x={"dense_input": x_train},
4746
y=y_train,
4847
batch_size=128,
4948
num_epochs=None,
@@ -70,28 +69,17 @@ def train_input_fn():
7069

7170
# eval_input_fn ALWAYS uses data loaded in memory, since InputMode.SPARK can only feed one RDD at a time
7271
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
73-
x={"dense_1_input": x_test},
72+
x={"dense_input": x_test},
7473
y=y_test,
7574
num_epochs=args.epochs,
7675
shuffle=False)
7776

78-
# serving_input_receiver_fn ALWAYS expects serialized TFExamples in a placeholder.
79-
def serving_input_receiver_fn():
80-
"""An input receiver that expects a serialized tf.Example."""
81-
serialized_tf_example = tf.placeholder(dtype=tf.string,
82-
shape=[args.batch_size],
83-
name='input_example_tensor')
84-
receiver_tensors = {'dense_1_input': serialized_tf_example}
85-
feature_spec = {'dense_1_input': tf.FixedLenFeature(784, tf.string)}
86-
features = tf.parse_example(serialized_tf_example, feature_spec)
87-
return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
88-
8977
# setup tf.estimator.train_and_evaluate() w/ FinalExporter
90-
exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=serving_input_receiver_fn)
78+
feature_spec = {'dense_input': tf.FixedLenFeature(784, tf.float32)}
79+
exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec))
9180
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=args.steps)
9281
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, exporters=exporter)
9382
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
94-
9583
else: # mode == 'inference'
9684
if args.input_mode == 'spark':
9785
tf_feed = TFNode.DataFeed(ctx.mgr)
@@ -137,7 +125,7 @@ def predict_input_fn():
137125
parser.add_argument("--input_mode", help="input mode (tf|spark)", default="tf")
138126
parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized CSV format")
139127
parser.add_argument("--model_dir", help="directory to write model checkpoints")
140-
parser.add_argument("--mode", help="(train|inference")
128+
parser.add_argument("--mode", help="(train|inference)", default="train")
141129
parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions")
142130
parser.add_argument("--num_ps", help="number of ps nodes", type=int, default=1)
143131
parser.add_argument("--steps", help="max number of steps to train", type=int, default=2000)

examples/mnist/spark/mnist_dist.py

Lines changed: 85 additions & 118 deletions
Original file line number | Diff line number | Diff line change
@@ -1,178 +1,145 @@
1-
# Copyright 2018 Yahoo Inc.
1+
# Copyright 2017 Yahoo Inc.
22
# Licensed under the terms of the Apache 2.0 license.
33
# Please see LICENSE file in the project root for terms.
44

55
# Distributed MNIST on grid based on TensorFlow MNIST example
66

77
from __future__ import absolute_import
88
from __future__ import division
9-
from __future__ import nested_scopes
109
from __future__ import print_function
1110

12-
from datetime import datetime
13-
import tensorflow as tf
14-
from tensorflowonspark import TFNode
15-
1611

1712
def print_log(worker_num, arg):
1813
print("{0}: {1}".format(worker_num, arg))
1914

2015

21-
class ExportHook(tf.train.SessionRunHook):
22-
def __init__(self, export_dir, input_tensor, output_tensor):
23-
self.export_dir = export_dir
24-
self.input_tensor = input_tensor
25-
self.output_tensor = output_tensor
26-
27-
def end(self, session):
28-
print("{} ======= Exporting to: {}".format(datetime.now().isoformat(), self.export_dir))
29-
signatures = {
30-
tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: {
31-
'inputs': {'image': self.input_tensor},
32-
'outputs': {'prediction': self.output_tensor},
33-
'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
34-
}
35-
}
36-
TFNode.export_saved_model(session,
37-
self.export_dir,
38-
tf.saved_model.tag_constants.SERVING,
39-
signatures)
40-
print("{} ======= Done exporting".format(datetime.now().isoformat()))
41-
42-
4316
def map_fun(args, ctx):
17+
from datetime import datetime
4418
import math
4519
import numpy
20+
import tensorflow as tf
4621
import time
4722

4823
worker_num = ctx.worker_num
4924
job_name = ctx.job_name
5025
task_index = ctx.task_index
5126

52-
# Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
53-
if job_name == "ps":
54-
time.sleep((worker_num + 1) * 5)
55-
5627
# Parameters
5728
IMAGE_PIXELS = 28
5829
hidden_units = 128
59-
batch_size = args.batch_size
6030

6131
# Get TF cluster and server instances
6232
cluster, server = ctx.start_cluster_server(1, args.rdma)
6333

64-
def feed_dict(batch):
65-
# Convert from [(images, labels)] to two numpy arrays of the proper type
66-
images = []
67-
labels = []
68-
for item in batch:
69-
images.append(item[0])
70-
labels.append(item[1])
71-
xs = numpy.array(images)
72-
xs = xs.astype(numpy.float32)
73-
xs = xs / 255.0
74-
ys = numpy.array(labels)
75-
ys = ys.astype(numpy.uint8)
76-
return (xs, ys)
34+
# Create generator for Spark data feed
35+
tf_feed = ctx.get_data_feed(args.mode == 'train')
36+
37+
def rdd_generator():
38+
while not tf_feed.should_stop():
39+
batch = tf_feed.next_batch(1)
40+
if len(batch) == 0:
41+
return
42+
row = batch[0]
43+
image = numpy.array(row[0]).astype(numpy.float32) / 255.0
44+
label = numpy.array(row[1]).astype(numpy.int64)
45+
yield (image, label)
7746

7847
if job_name == "ps":
7948
server.join()
8049
elif job_name == "worker":
81-
8250
# Assigns ops to the local worker by default.
8351
with tf.device(tf.train.replica_device_setter(
8452
worker_device="/job:worker/task:%d" % task_index,
8553
cluster=cluster)):
8654

87-
# Placeholders or QueueRunner/Readers for input data
88-
with tf.name_scope('inputs'):
89-
x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
90-
y_ = tf.placeholder(tf.float32, [None, 10], name="y_")
91-
92-
x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
93-
tf.summary.image("x_img", x_img)
94-
95-
with tf.name_scope('layer'):
96-
# Variables of the hidden layer
97-
with tf.name_scope('hidden_layer'):
98-
hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], stddev=1.0 / IMAGE_PIXELS), name="hid_w")
99-
hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
100-
tf.summary.histogram("hidden_weights", hid_w)
101-
hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
102-
hid = tf.nn.relu(hid_lin)
103-
104-
# Variables of the softmax layer
105-
with tf.name_scope('softmax_layer'):
106-
sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10], stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
107-
sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
108-
tf.summary.histogram("softmax_weights", sm_w)
109-
y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
55+
# Dataset for input data
56+
ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32), (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]), tf.TensorShape([10]))).batch(args.batch_size)
57+
iterator = ds.make_one_shot_iterator()
58+
x, y_ = iterator.get_next()
11059

111-
global_step = tf.train.get_or_create_global_step()
60+
# Variables of the hidden layer
61+
hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
62+
stddev=1.0 / IMAGE_PIXELS), name="hid_w")
63+
hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
64+
tf.summary.histogram("hidden_weights", hid_w)
65+
66+
# Variables of the softmax layer
67+
sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
68+
stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
69+
sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
70+
tf.summary.histogram("softmax_weights", sm_w)
71+
72+
x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
73+
tf.summary.image("x_img", x_img)
74+
75+
hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
76+
hid = tf.nn.relu(hid_lin)
11277

113-
with tf.name_scope('loss'):
114-
loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
115-
tf.summary.scalar("loss", loss)
78+
y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
11679

117-
with tf.name_scope('train'):
118-
train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)
80+
global_step = tf.train.get_or_create_global_step()
81+
82+
loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
83+
tf.summary.scalar("loss", loss)
84+
train_op = tf.train.AdagradOptimizer(0.01).minimize(
85+
loss, global_step=global_step)
11986

12087
# Test trained model
12188
label = tf.argmax(y_, 1, name="label")
12289
prediction = tf.argmax(y, 1, name="prediction")
12390
correct_prediction = tf.equal(prediction, label)
124-
12591
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
12692
tf.summary.scalar("acc", accuracy)
12793

94+
saver = tf.train.Saver()
12895
summary_op = tf.summary.merge_all()
96+
init_op = tf.global_variables_initializer()
12997

98+
# Create a "supervisor", which oversees the training process and stores model state into HDFS
13099
logdir = ctx.absolute_path(args.model)
131100
print("tensorflow model path: {0}".format(logdir))
101+
summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num, graph=tf.get_default_graph())
132102

133-
if job_name == "worker" and task_index == 0:
134-
summary_writer = tf.summary.FileWriter(logdir, graph=tf.get_default_graph())
135-
136-
# The MonitoredTrainingSession takes care of session initialization, restoring from
137-
# a checkpoint, and closing when done or an error occurs
138103
with tf.train.MonitoredTrainingSession(master=server.target,
139-
is_chief=(task_index == 0),
140-
checkpoint_dir=logdir,
141-
save_checkpoint_secs=10,
142-
hooks=[tf.train.StopAtStepHook(last_step=args.steps)],
143-
chief_only_hooks=[ExportHook(ctx.absolute_path(args.export_dir), x, prediction)]) as mon_sess:
104+
is_chief=(task_index == 0),
105+
scaffold=tf.train.Scaffold(init_op=init_op, summary_op=summary_op, saver=saver),
106+
checkpoint_dir=logdir,
107+
hooks=[tf.train.StopAtStepHook(last_step=args.steps)]) as sess:
108+
print("{} session ready".format(datetime.now().isoformat()))
109+
110+
# Loop until the session shuts down or feed has no more data
144111
step = 0
145-
tf_feed = ctx.get_data_feed(args.mode == "train")
146-
while not mon_sess.should_stop() and not tf_feed.should_stop():
147-
# Run a training step asynchronously
112+
while not sess.should_stop() and not tf_feed.should_stop():
113+
# Run a training step asynchronously.
148114
# See `tf.train.SyncReplicasOptimizer` for additional details on how to
149115
# perform *synchronous* training.
150116

151-
# using feed_dict
152-
batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
153-
feed = {x: batch_xs, y_: batch_ys}
154-
155-
if len(batch_xs) > 0:
156-
if args.mode == "train":
157-
_, summary, step = mon_sess.run([train_op, summary_op, global_step], feed_dict=feed)
158-
# print accuracy and save model checkpoint to HDFS every 100 steps
159-
if (step % 100 == 0):
160-
print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, mon_sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
161-
162-
if task_index == 0:
163-
summary_writer.add_summary(summary, step)
164-
else: # args.mode == "inference"
165-
labels, preds, acc = mon_sess.run([label, prediction, accuracy], feed_dict=feed)
166-
167-
results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)]
168-
tf_feed.batch_results(results)
169-
print("results: {0}, acc: {1}".format(results, acc))
170-
171-
if mon_sess.should_stop() or step >= args.steps:
172-
tf_feed.terminate()
173-
174-
# Ask for all the services to stop.
175-
print("{0} stopping MonitoredTrainingSession".format(datetime.now().isoformat()))
176-
177-
if job_name == "worker" and task_index == 0:
178-
summary_writer.close()
117+
if args.mode == "train":
118+
_, summary, step = sess.run([train_op, summary_op, global_step])
119+
if (step % 100 == 0):
120+
print("{} step: {} accuracy: {}".format(datetime.now().isoformat(), step, sess.run(accuracy)))
121+
if task_index == 0:
122+
summary_writer.add_summary(summary, step)
123+
else: # args.mode == "inference"
124+
labels, preds, acc = sess.run([label, prediction, accuracy])
125+
results = ["{} Label: {}, Prediction: {}".format(datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)]
126+
tf_feed.batch_results(results)
127+
print("acc: {}".format(acc))
128+
129+
print("{} stopping MonitoredTrainingSession".format(datetime.now().isoformat()))
130+
131+
# WORKAROUND FOR https://github.com/tensorflow/tensorflow/issues/21745
132+
# wait for all other nodes to complete (via done files)
133+
done_dir = "{}/{}/done".format(ctx.absolute_path(args.model), args.mode)
134+
print("Writing done file to: {}".format(done_dir))
135+
tf.gfile.MakeDirs(done_dir)
136+
with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index), 'w') as done_file:
137+
done_file.write("done")
138+
139+
for i in range(60):
140+
if len(tf.gfile.ListDirectory(done_dir)) < len(ctx.cluster_spec['worker']):
141+
print("{} Waiting for other nodes {}".format(datetime.now().isoformat(), i))
142+
time.sleep(1)
143+
else:
144+
print("{} All nodes done".format(datetime.now().isoformat()))
145+
break

0 commit comments

Comments
 (0)