Replace horovod with estimator (#732)

fsx950223 · web-flow · commit 870c5106b391 · 2020-09-06T21:21:22.000-07:00
* replace horovod with estimator

* remove useless functions

* remove useless hparam

* update readme

* fix style

* revert dataloader

* update comment

* enable eager execution

* use old batch norm

* remove tf.function
diff --git a/efficientdet/README.md b/efficientdet/README.md
@@ -312,8 +312,6 @@ You should check more details of runmode which is written in caption-4.
 
 ## 9. Train on multi GPUs.
 
-Install [horovod](https://github.com/horovod/horovod#id6).
-
 Create a config file for the PASCAL VOC dataset called voc_config.yaml and put this in it.
 
       num_classes: 21
@@ -327,7 +325,7 @@ Download efficientdet coco checkpoint.
 
 Finetune needs to use --ckpt rather than --backbone_ckpt.
 
-    !horovodrun -np <num_gpus> -H localhost:<num_gpus> python main.py --mode=train \
+    python main.py --mode=train \
         --training_file_pattern=tfrecord/pascal*.tfrecord \
         --validation_file_pattern=tfrecord/pascal*.tfrecord \
         --model_name=efficientdet-d0 \
@@ -337,7 +335,7 @@ Finetune needs to use --ckpt rather than --backbone_ckpt.
         --eval_batch_size=64 --eval_samples=1024 \
         --num_examples_per_epoch=5717 --num_epochs=50  \
         --hparams=voc_config.yaml \
-        --strategy=horovod
+        --strategy=gpus
 
 If you want to do inference for custom data, you can run
 
diff --git a/efficientdet/aug/autoaugment.py b/efficientdet/aug/autoaugment.py
@@ -1633,6 +1633,7 @@ def final_policy(image_, bboxes_):
   return (augmented_images, augmented_bboxes)
 
 
+@tf.autograph.experimental.do_not_convert
 def distort_image_with_autoaugment(image,
                                    bboxes,
                                    augmentation_name,
diff --git a/efficientdet/dataloader.py b/efficientdet/dataloader.py
@@ -22,7 +22,7 @@
 from object_detection import tf_example_decoder
 
 
-class InputProcessor(object):
+class InputProcessor:
   """Base class of Input processor."""
 
   def __init__(self, image, output_size):
@@ -207,12 +207,10 @@ def offset_y(self):
 
 def pad_to_fixed_size(data, pad_value, output_shape):
   """Pad data to a fixed length at the first dimension.
-
   Args:
     data: Tensor to be padded to output_shape.
     pad_value: A constant value assigned to the paddings.
     output_shape: The output shape of a 2D tensor.
-
   Returns:
     The Padded tensor with output_shape [max_instances_per_image, dimension].
   """
@@ -230,7 +228,7 @@ def pad_to_fixed_size(data, pad_value, output_shape):
   return padded_data
 
 
-class InputReader(object):
+class InputReader:
   """Input reader for dataset."""
 
   def __init__(self,
@@ -374,7 +372,8 @@ def process_example(self, params, batch_size, images, cls_targets,
     labels['image_masks'] = image_masks
     return images, labels
 
-  def __call__(self, params):
+
+  def __call__(self, params, input_context=None):
     input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
@@ -391,7 +390,9 @@ def __call__(self, params):
         self._file_pattern, shuffle=self._is_training)
     if self._is_training:
       dataset = dataset.repeat()
-
+    if input_context:
+      dataset = dataset.shard(input_context.num_input_pipelines,
+                              input_context.input_pipeline_id)
     # Prefetch data from files.
     def _prefetch_dataset(filename):
       if params.get('dataset_type', None) == 'sstable':
@@ -404,6 +405,9 @@ def _prefetch_dataset(filename):
         _prefetch_dataset, num_parallel_calls=tf.data.experimental.AUTOTUNE)
     options = tf.data.Options()
     options.experimental_deterministic = not self._is_training
+    options.experimental_optimization.map_vectorization.enabled = True
+    options.experimental_optimization.map_parallelization = True
+    options.experimental_optimization.parallel_batch = True
     dataset = dataset.with_options(options)
     if self._is_training:
       dataset = dataset.shuffle(64)
@@ -429,4 +433,5 @@ def _prefetch_dataset(filename):
       # first batch. This reduces variance in performance and is useful in
       # testing.
       dataset = dataset.take(1).cache().repeat()
+    dataset = dataset.apply(tf.data.experimental.ignore_errors())
     return dataset
diff --git a/efficientdet/det_model_fn.py b/efficientdet/det_model_fn.py
@@ -405,9 +405,7 @@ def model_fn(inputs):
     ema = tf.train.ExponentialMovingAverage(
         decay=moving_average_decay, num_updates=global_step)
     ema_vars = utils.get_ema_vars()
-  if params['strategy'] == 'horovod':
-    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
-    learning_rate = learning_rate * hvd.size()
+
   if mode == tf.estimator.ModeKeys.TRAIN:
     if params['optimizer'].lower() == 'sgd':
       optimizer = tf.train.MomentumOptimizer(
@@ -419,9 +417,6 @@ def model_fn(inputs):
 
     if params['strategy'] == 'tpu':
       optimizer = tf.tpu.CrossShardOptimizer(optimizer)
-    elif params['strategy'] == 'horovod':
-      optimizer = hvd.DistributedOptimizer(optimizer)
-      training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
 
     # Batch norm requires update_ops to be added as a train_op dependency.
     update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
@@ -577,7 +572,6 @@ def scaffold_fn():
           skip_mismatch=params['skip_mismatch'])
 
       tf.train.init_from_checkpoint(checkpoint, var_map)
-
       return tf.train.Scaffold()
   elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:
 
@@ -592,21 +586,22 @@ def scaffold_fn():
 
   if params['strategy'] != 'tpu':
     # Profile every 1K steps.
-    profile_hook = tf.train.ProfilerHook(
-        save_steps=1000, output_dir=params['model_dir'])
-    training_hooks.append(profile_hook)
+    if params.get('profile', False):
+      profile_hook = tf.estimator.ProfilerHook(
+          save_steps=1000, output_dir=params['model_dir'], show_memory=True)
+      training_hooks.append(profile_hook)
 
-    # Report memory allocation if OOM
-    class OomReportingHook(tf.estimator.SessionRunHook):
+      # Report memory allocation if OOM
+      class OomReportingHook(tf.estimator.SessionRunHook):
 
-      def before_run(self, run_context):
-        return tf.estimator.SessionRunArgs(
-            fetches=[],
-            options=tf.RunOptions(report_tensor_allocations_upon_oom=True))
+        def before_run(self, run_context):
+          return tf.estimator.SessionRunArgs(
+              fetches=[],
+              options=tf.RunOptions(report_tensor_allocations_upon_oom=True))
 
-    training_hooks.append(OomReportingHook())
+      training_hooks.append(OomReportingHook())
 
-    logging_hook = tf.train.LoggingTensorHook(
+    logging_hook = tf.estimator.LoggingTensorHook(
         {
             'step': global_step,
             'det_loss': det_loss,
@@ -616,15 +611,24 @@ def before_run(self, run_context):
         every_n_iter=params.get('iterations_per_loop', 100),
     )
     training_hooks.append(logging_hook)
-
-  return tf.estimator.tpu.TPUEstimatorSpec(
-      mode=mode,
-      loss=total_loss,
-      train_op=train_op,
-      eval_metrics=eval_metrics,
-      host_call=utils.get_tpu_host_call(global_step, params),
-      scaffold_fn=scaffold_fn,
-      training_hooks=training_hooks)
+  if params['strategy'] == 'tpu':
+    return tf.estimator.tpu.TPUEstimatorSpec(
+        mode=mode,
+        loss=total_loss,
+        train_op=train_op,
+        eval_metrics=eval_metrics,
+        host_call=utils.get_tpu_host_call(global_step, params),
+        scaffold_fn=scaffold_fn,
+        training_hooks=training_hooks)
+  else:
+    eval_metric_ops = eval_metrics[0](eval_metrics[1]) if eval_metrics else None
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        loss=total_loss,
+        train_op=train_op,
+        eval_metric_ops=eval_metric_ops,
+        scaffold=scaffold_fn(),
+        training_hooks=training_hooks)
 
 
 def efficientdet_model_fn(features, labels, mode, params):
diff --git a/efficientdet/efficientdet_arch.py b/efficientdet/efficientdet_arch.py
@@ -52,34 +52,6 @@ def freeze_vars(variables, pattern):
   return variables
 
 
-def resize_bilinear(images, size, output_type):
-  """Returns resized images as output_type."""
-  images = tf.image.resize_bilinear(images, size, align_corners=True)
-  return tf.cast(images, output_type)
-
-
-def remove_variables(variables, resnet_depth=50):
-  """Removes low-level variables from the input.
-
-  Removing low-level parameters (e.g., initial convolution layer) from training
-  usually leads to higher training speed and slightly better testing accuracy.
-  The intuition is that the low-level architecture (e.g., ResNet-50) is able to
-  capture low-level features such as edges; therefore, it does not need to be
-  fine-tuned for the detection task.
-
-  Args:
-    variables: all the variables in training
-    resnet_depth: the depth of ResNet model
-
-  Returns:
-    var_list: a list containing variables for training
-
-  """
-  var_list = [v for v in variables
-              if v.name.find('resnet%s/conv2d/' % resnet_depth) == -1]
-  return var_list
-
-
 def resample_feature_map(feat,
                          name,
                          target_height,
diff --git a/efficientdet/hparams_config.py b/efficientdet/hparams_config.py
@@ -278,8 +278,6 @@ def default_detection_configs():
   h.dataset_type = None
   h.positives_momentum = None
 
-  # unused.
-  h.resnet_depth = 50
   return h
 
 
diff --git a/efficientdet/keras/util_keras.py b/efficientdet/keras/util_keras.py
@@ -44,14 +44,7 @@ def build_batch_norm(is_training_bn: bool,
     A normalized `Tensor` with the same `data_format`.
   """
   axis = 1 if data_format == 'channels_first' else -1
-  if is_training_bn and strategy in ('gpus',):
-    batch_norm_class = tf.keras.layers.experimental.SyncBatchNormalization
-  elif (not tf.compat.v1.executing_eagerly_outside_functions() or
-        (is_training_bn and strategy in ('tpu',))):
-    # TODO(tanmingxing): compare them on TPU.
-    batch_norm_class = utils.batch_norm_class(is_training_bn, strategy)
-  else:
-    batch_norm_class = tf.keras.layers.BatchNormalization
+  batch_norm_class = utils.batch_norm_class(is_training_bn, strategy)
 
   bn_layer = batch_norm_class(
       axis=axis,
diff --git a/efficientdet/main.py b/efficientdet/main.py
diff --git a/efficientdet/utils.py b/efficientdet/utils.py