
Commit 3d1ddcb

Support customized training loop and dataset using TensorFlow 1.x (#2500)

* Worker can report training data to the master if using recordio
* Add logs
* Create a TensorFlow operator
* Wrap the job command using parentheses
* Pre-commit
* Fix training loop
* Pre-commit
* Develop a TensorFlow V1 model
* Fix the bug when the host of a new worker is the same as an old worker's
* Do not call on_pod_deleted if the status changes from FAILED to DELETED
* Fix the counter when retrying
* Fix backward_passes_per_step
* Pre-commit
* Pre-commit
* Delete unused logs

1 parent b296657, commit 3d1ddcb

File tree: 3 files changed, +208 −38 lines

* elasticai_api/common/data_shard_service.py
* elasticai_api/tensorflow/controller.py
* model_zoo/mnist/mnist_train_tfv1.py

elasticai_api/common/data_shard_service.py

Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ def __init__(
         self._report_training_params()
 
     def _report_training_params(self):
-        if self._num_epochs and self._dataset_size:
+        if self._num_epochs and (self._dataset_size or self._training_data):
            self._mc.report_training_params(
                batch_size=self._batch_size,
                num_epochs=self._num_epochs,
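With this change the data shard service reports its training parameters when either a dataset size or a training data path is configured, not only when the size is known. A minimal sketch of the two configurations the guard is meant to cover, using `create_elastic_controller` from the controller change below; the literal values and the RecordIO path are placeholders, and a reachable ElasticDL master is assumed:

```python
from elasticai_api.tensorflow.controller import create_elastic_controller

# 1) The dataset size is known up front.
controller = create_elastic_controller(
    batch_size=64, num_epochs=1, dataset_size=60000
)

# 2) Only a RecordIO training data path is known (placeholder path).
controller = create_elastic_controller(
    batch_size=64, num_epochs=1, training_data="/data/mnist/train"
)
```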

elasticai_api/tensorflow/controller.py

Lines changed: 56 additions & 4 deletions

@@ -12,6 +12,7 @@
 # limitations under the License.
 
 import time
+from distutils.version import LooseVersion
 
 import tensorflow as tf
 from tensorflow.python.framework.errors_impl import UnknownError
@@ -21,6 +22,8 @@
     RETRY_ALLREDUCE_INTERVAL_SECS,
     AllReduceController,
 )
+from elasticai_api.common.data_shard_service import RecordIndexService
+from elasticai_api.common.master_client import build_master_client
 from elasticai_api.util.log_utils import default_logger as logger
 
 try:
@@ -30,6 +33,50 @@
 except ImportError:
     hvd = None
 
+_IS_TF2 = LooseVersion(tf.__version__) >= LooseVersion("2.0.0")
+
+
+def create_elastic_controller(
+    batch_size,
+    num_epochs=None,
+    dataset_size=None,
+    shuffle=False,
+    training_data=None,
+):
+    """Create an elastic AllReduce controller with data shard service.
+    Users can use the `controller.data_shard_service` to get data
+    shards like:
+    ```python
+    shard = controller.data_shard_service.fetch_shard()
+    ```
+
+    Users also can use the controller to do an elastic training.
+
+    Args:
+        batch_size: The batch size of a single worker.
+        num_epochs: The number of epochs.
+        dataset_size: The total size of dataset.
+    """
+    master_client = build_master_client()
+    record_index_service = RecordIndexService(
+        master_client=master_client,
+        batch_size=batch_size,
+        num_epochs=num_epochs,
+        dataset_size=dataset_size,
+        shuffle=shuffle,
+        training_data=training_data,
+    )
+    if _IS_TF2:
+        controller = TensorFlowV2AllReduceController(
+            master_client, record_index_service
+        )
+    else:
+        controller = TensorFlowV1AllReduceController(
+            master_client, record_index_service
+        )
+    controller.init_horovod_locally()
+    return controller
+
 
 class TensorFlowV2AllReduceController(AllReduceController):
     """The controller is responsible for elastic training of
@@ -87,13 +134,18 @@ def __init__(self, master_client, master_addr):
             master_client, master_addr
         )
         self._bcast_op = None
+        self._session = None
 
-    def broadcast(self):
+    def set_broadcast_variables(self, variables):
         if self._bcast_op is None:
-            self._variables = tf.global_variables()
+            self._variables = variables
            self._bcast_op = broadcast_variables(self._variables, root_rank=0)
-        session = tf.get_default_session()
-        session.run(self._bcast_op)
+
+    def set_session(self, session):
+        self._session = session
+
+    def broadcast(self):
+        self._session.run(self._bcast_op)
 
     def train_one_batch_with_retries(self, func, *args, **kwargs):
         allreduce_success = False
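In the TensorFlow 1.x controller, broadcasting no longer grabs tf.get_default_session() and tf.global_variables() itself; the caller now registers the variables and the session explicitly. A condensed sketch of the resulting custom training loop, distilled from the updated model_zoo example below (the stand-in graph and the RecordIO path are placeholders; a reachable ElasticDL master and a Horovod setup are assumed):

```python
import tensorflow as tf

from elasticai_api.tensorflow.controller import create_elastic_controller
from elasticai_api.tensorflow.optimizer import DistributedOptimizer


def train_one_batch(sess, run_tensors):
    return sess.run(run_tensors)


controller = create_elastic_controller(
    batch_size=64, num_epochs=1, training_data="/data/mnist/train"
)

# Trivial stand-in graph; a real job builds its model from the dataset
# backed by controller.data_shard_service.
w = tf.Variable(tf.zeros([1]))
loss = tf.reduce_mean(tf.square(w - 1.0))
optimizer = DistributedOptimizer(
    tf.train.GradientDescentOptimizer(0.1), fixed_global_batch_size=True
)
train_step = optimizer.minimize(loss)

# Wrap one training step with the controller's elastic retry logic.
elastic_train_one_batch = controller.elastic_run(train_one_batch)
controller.set_broadcast_variables(tf.global_variables())
with controller.scope():
    with tf.train.MonitoredTrainingSession() as sess:
        controller.set_session(sess)
        loss_value, _ = elastic_train_one_batch(sess, [loss, train_step])
```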

model_zoo/mnist/mnist_train_tfv1.py

Lines changed: 151 additions & 33 deletions

@@ -11,56 +11,147 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import horovod.tensorflow as hvd
+import argparse
+from contextlib import closing
+
+import recordio
 import tensorflow as tf
 
-from elasticdl.python.common.constants import Mode
+from elasticai_api.tensorflow.controller import create_elastic_controller
+from elasticai_api.tensorflow.optimizer import (
+    AdjustBackwardPassesPerStepHook,
+    DistributedOptimizer,
+)
 from elasticdl.python.common.log_utils import default_logger as logger
 
+layers = tf.layers
 
-def train(dataset, elastic_controller):
-    dataset_it = dataset.make_one_shot_iterator()
-    batch_x, batch_y = dataset_it.get_next()
-    batch_x = tf.cast(batch_x, tf.float32)
 
-    x = tf.keras.layers.Reshape((28, 28, 1))(batch_x)
-    x = tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu")(x)
-    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation="relu")(x)
-    x = tf.keras.layers.BatchNormalization()(x)
-    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
-    x = tf.keras.layers.Dropout(0.25)(x)
-    x = tf.keras.layers.Flatten()(x)
-    outputs = tf.keras.layers.Dense(10)(x)
-    loss = tf.reduce_mean(
-        input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
-            logits=outputs, labels=tf.reshape(batch_y, [-1])
+def get_dataset_gen(data_shard_service):
+    def gen():
+        while True:
+            shard = data_shard_service.fetch_shard()
+            if not shard:
+                raise StopIteration("No data")
+            with closing(
+                recordio.Scanner(
+                    shard.name, shard.start, shard.end - shard.start,
+                )
+            ) as reader:
+                for i in range(shard.start, shard.end):
+                    record = reader.record()
+                    if record:
+                        yield record
+
+    return gen
+
+
+def create_dataset(data_shard_service):
+    gen = get_dataset_gen(data_shard_service)
+    dataset = tf.data.Dataset.from_generator(gen, tf.string)
+    return dataset
+
+
+def conv_model(feature, target, mode):
+    """2-layer convolution model."""
+    # Convert the target to a one-hot tensor of shape (batch_size, 10) and
+    # with a on-value of 1 for each one-hot vector of length 10.
+    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)
+
+    # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
+    # image width and height final dimension being the number of color
+    # channels.
+    feature = tf.reshape(feature, [-1, 28, 28, 1])
+
+    # First conv layer will compute 32 features for each 5x5 patch
+    with tf.variable_scope("conv_layer1"):
+        h_conv1 = layers.conv2d(
+            feature,
+            32,
+            kernel_size=[5, 5],
+            activation=tf.nn.relu,
+            padding="SAME",
+        )
+        h_pool1 = tf.nn.max_pool(
+            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME"
+        )
+
+    # Second conv layer will compute 64 features for each 5x5 patch.
+    with tf.variable_scope("conv_layer2"):
+        h_conv2 = layers.conv2d(
+            h_pool1,
+            64,
+            kernel_size=[5, 5],
+            activation=tf.nn.relu,
+            padding="SAME",
+        )
+        h_pool2 = tf.nn.max_pool(
+            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME"
        )
+    # reshape tensor into a batch of vectors
+    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
+
+    # Densely connected layer with 1024 neurons.
+    h_fc1 = layers.dropout(
+        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
+        rate=0.5,
+        training=mode == tf.estimator.ModeKeys.TRAIN,
    )
-    optimizer = tf.train.GradientDescentOptimizer(0.1)
-    optimizer = hvd.DistributedOptimizer(optimizer)
-    train_step = optimizer.minimize(loss)
 
-    with tf.Session() as sess:
-        sess.run(tf.global_variables_initializer())
+    # Compute logits (1 per class) and compute loss.
+    logits = layers.dense(h_fc1, 10, activation=None)
+    loss = tf.losses.softmax_cross_entropy(target, logits)
 
-        # Use the elastic wrapper to wrap the function to train one batch
-        elastic_train_one_batch = elastic_controller.elastic_run(
-            train_one_batch
-        )
-        for i in range(1000):
-            loss_value, _ = elastic_train_one_batch(sess, [loss, train_step])
-            logger.info("loss: {}".format(loss_value))
+    return tf.argmax(logits, 1), loss
+
+
+def train(args):
+    allreduce_controller = create_elastic_controller(
+        batch_size=args.batch_size,
+        num_epochs=args.num_epochs,
+        training_data=args.training_data,
+    )
+    dataset = create_dataset(allreduce_controller.data_shard_service)
+    dataset = feed(dataset)
+    dataset = dataset.batch(args.batch_size).prefetch(1)
+    dataset_it = dataset.make_one_shot_iterator()
+    batch_x, batch_y = dataset_it.get_next()
+    batch_x = tf.cast(batch_x, tf.float32)
+
+    batch_y = tf.reshape(batch_y, (-1,))
+    image = tf.reshape(batch_x, (-1, 784))
+    predict, loss = conv_model(image, batch_y, tf.estimator.ModeKeys.TRAIN)
+    optimizer = tf.train.GradientDescentOptimizer(0.1)
+    optimizer = DistributedOptimizer(optimizer, fixed_global_batch_size=True)
+    global_step = tf.train.get_or_create_global_step()
+    train_step = optimizer.minimize(loss, global_step=global_step)
+
+    # Use the elastic wrapper to wrap the function to train one batch
+    elastic_train_one_batch = allreduce_controller.elastic_run(train_one_batch)
+    hook = AdjustBackwardPassesPerStepHook(optimizer)
+    allreduce_controller.set_broadcast_variables(tf.global_variables())
+    with allreduce_controller.scope():
+        with tf.train.MonitoredTrainingSession(hooks=[hook]) as sess:
+            allreduce_controller.set_session(sess)
+            try:
+                while True:
+                    loss_value, step, _ = elastic_train_one_batch(
+                        sess, [loss, global_step, train_step]
+                    )
+                    logger.info(
+                        "global step = {}. loss: {}".format(step, loss_value)
+                    )
+            except tf.errors.OutOfRangeError:
+                print("end!")
 
 
 def train_one_batch(sess, run_tensors):
     return sess.run(run_tensors)
 
 
-def feed(dataset, mode, _):
+def feed(dataset):
     dataset = dataset.map(_parse_data)
-
-    if mode == Mode.TRAINING:
-        dataset = dataset.shuffle(buffer_size=1024)
+    dataset = dataset.shuffle(buffer_size=1024)
     return dataset
 
 
@@ -83,3 +174,30 @@ def eval_metrics_fn():
             tf.cast(tf.reshape(labels, [-1]), tf.int32),
        )
    }
+
+
+def arg_parser():
+    parser = argparse.ArgumentParser(description="Process training parameters")
+    parser.add_argument("--batch_size", type=int, default=64, required=False)
+    parser.add_argument("--num_epochs", type=int, default=1, required=False)
+    parser.add_argument(
+        "--learning_rate", type=float, default=0.1, required=False
+    )
+    parser.add_argument(
+        "--no-cuda",
+        action="store_true",
+        default=False,
+        help="disable CUDA training",
+    )
+    parser.add_argument("--training_data", type=str, required=True)
+    parser.add_argument(
+        "--validation_data", type=str, default="", required=False
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    parser = arg_parser()
+    args = parser.parse_args()
+    print(args)
+    train(args)
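The example is now a self-contained script configured through argparse, with --training_data as the only required flag. A hypothetical smoke test that drives the new entry point from Python (the RecordIO path is a placeholder; model_zoo must be importable, and a reachable ElasticDL master plus a Horovod setup are assumed, because create_elastic_controller builds a master client at startup):

```python
# Hypothetical invocation of the updated example; the data path is a
# placeholder and the surrounding ElasticDL job environment is assumed.
from model_zoo.mnist.mnist_train_tfv1 import arg_parser, train

args = arg_parser().parse_args(
    ["--training_data", "/data/mnist/train", "--batch_size", "64"]
)
train(args)
```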
