add test-horovod.py

hephaex · hephaex · commit c4577ac0c935 · 2020-08-31T00:03:18.000+09:00
diff --git a/tests/test-horovod.py b/tests/test-horovod.py
@@ -0,0 +1,137 @@
+import argparse
+
+import keras
+from keras.datasets import mnist
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Flatten
+from keras.layers import Conv2D, MaxPooling2D
+from keras.preprocessing.image import ImageDataGenerator
+from keras import backend as K
+import tensorflow as tf
+import horovod.keras as hvd
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+parser.add_argument('--batch-size', type=int, default=128, metavar='N',
+                    help='input batch size for training (default: 128)')
+parser.add_argument('--epochs', type=int, default=24, metavar='N',
+                    help='number of epochs to train (default: 24)')
+parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
+                    help='learning rate (default: 1.0)')
+
+args = parser.parse_args()
+
+# Horovod: initialize Horovod.
+hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+config = tf.ConfigProto()
+config.gpu_options.allow_growth = True
+config.gpu_options.visible_device_list = str(hvd.local_rank())
+K.set_session(tf.Session(config=config))
+
+batch_size = args.batch_size
+num_classes = 10
+
+# Enough epochs to demonstrate learning rate warmup and the reduction of
+# learning rate when training plateaues.
+epochs = args.epochs
+
+# Input image dimensions
+img_rows, img_cols = 28, 28
+
+# The data, shuffled and split between train and test sets
+(x_train, y_train), (x_test, y_test) = mnist.load_data()
+
+# Determine how many batches are there in train and test sets
+train_batches = len(x_train) // batch_size
+test_batches = len(x_test) // batch_size
+
+if K.image_data_format() == 'channels_first':
+    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
+    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
+    input_shape = (1, img_rows, img_cols)
+else:
+    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
+    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
+    input_shape = (img_rows, img_cols, 1)
+
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+print('x_train shape:', x_train.shape)
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
+
+# Convert class vectors to binary class matrices
+y_train = keras.utils.to_categorical(y_train, num_classes)
+y_test = keras.utils.to_categorical(y_test, num_classes)
+
+model = Sequential()
+model.add(Conv2D(32, kernel_size=(3, 3),
+                 activation='relu',
+                 input_shape=input_shape))
+model.add(Conv2D(64, (3, 3), activation='relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Dropout(0.25))
+model.add(Flatten())
+model.add(Dense(128, activation='relu'))
+model.add(Dropout(0.5))
+model.add(Dense(num_classes, activation='softmax'))
+
+# Horovod: adjust learning rate based on number of GPUs.
+scaled_lr = args.lr * hvd.size()
+opt = keras.optimizers.Adadelta(lr=scaled_lr)
+
+# Horovod: add Horovod Distributed Optimizer.
+opt = hvd.DistributedOptimizer(opt)
+
+model.compile(loss=keras.losses.categorical_crossentropy,
+              optimizer=opt,
+              metrics=['accuracy'])
+
+callbacks = [
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
+
+    # Horovod: average metrics among workers at the end of every epoch.
+    #
+    # Note: This callback must be in the list before the ReduceLROnPlateau,
+    # TensorBoard or other metrics-based callbacks.
+    hvd.callbacks.MetricAverageCallback(),
+
+    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr, verbose=1),
+
+    # Reduce the learning rate if training plateaues.
+    keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
+]
+
+# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
+if hvd.rank() == 0:
+	    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
+
+# Set up ImageDataGenerators to do data augmentation for the training images.
+train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3,
+                               height_shift_range=0.08, zoom_range=0.08)
+test_gen = ImageDataGenerator()
+
+# Train the model.
+# Horovod: the training will randomly sample 1 / N batches of training data and
+# 3 / N batches of validation data on every worker, where N is the number of workers.
+# Over-sampling of validation data helps to increase probability that every validation
+# example will be evaluated.
+model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size),
+                    steps_per_epoch=train_batches // hvd.size(),
+                    callbacks=callbacks,
+                    epochs=epochs,
+                    verbose=1,
+                    validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size),
+                    validation_steps=3 * test_batches // hvd.size())
+
+# Evaluate the model on the full data set.
+score = model.evaluate(x_test, y_test, verbose=0)
+print('Test loss:', score[0])
+print('Test accuracy:', score[1])