# limitations under the License.
# ==============================================================================
"""The main training script."""
-import multiprocessing
import os
from absl import app
from absl import flags
@@ -137,13 +136,12 @@ def main(_):
  tpu_cluster_resolver = None

  # Check data path
-  if FLAGS.mode in ('train',
-                    'train_and_eval') and FLAGS.training_file_pattern is None:
-    raise RuntimeError('You must specify --training_file_pattern for training.')
+  if FLAGS.mode in ('train', 'train_and_eval'):
+    if FLAGS.training_file_pattern is None:
+      raise RuntimeError('Must specify --training_file_pattern for train.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
-      raise RuntimeError('You must specify --validation_file_pattern '
-                         'for evaluation.')
+      raise RuntimeError('Must specify --validation_file_pattern for eval.')

  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
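For context, a minimal invocation satisfying both checks in `train_and_eval` mode might look like the following (the script name, model name, and paths are illustrative only):

    python main.py --mode=train_and_eval \
        --model_name=efficientdet-d0 \
        --model_dir=/tmp/model_dir \
        --training_file_pattern=/data/train-*.tfrecord \
        --validation_file_pattern=/data/val-*.tfrecord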
@@ -173,15 +171,6 @@ def main(_):
        'image_scales': None,
    }
    # The Input Partition Logic: We partition only the partition-able tensors.
-    # Spatial partition requires that the to-be-partitioned tensors must have a
-    # dimension that is a multiple of `partition_dims`. Depending on the
-    # `partition_dims` and the `image_size` and the `max_level` in config, some
-    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
-    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
-    # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
-    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
-    # case, the level-8 and level-9 target tensors are not partition-able, and
-    # the highest partition-able level is 7.
    feat_sizes = utils.get_feat_sizes(
        config.get('image_size'), config.get('max_level'))
    for level in range(config.get('min_level'), config.get('max_level') + 1):
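The comment removed above still documents the constraint that the partition logic enforces: a tensor can be spatially partitioned only if each spatial dimension is divisible by the corresponding partition factor. A minimal sketch of that check (hypothetical explicit-argument signature; the real `_can_partition` named in the next hunk's context takes its partition dims from the enclosing scope):

    # Sketch only: partition-able iff the spatial dimension divides evenly
    # across every partition factor.
    def _can_partition(spatial_dim, partition_dims):
      return all(spatial_dim % d == 0 for d in partition_dims)

    # Example from the deleted comment: image_size=1536 and max_level=9 give
    # level-8 features of width 1536 // 2**8 == 6, and 6 % 4 != 0, so levels
    # 8 and 9 are not partition-able with partition_dims=[1, 4, 2, 1].
    assert not _can_partition(6, [1, 4, 2, 1])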
@@ -254,56 +243,36 @@ def _can_partition(spatial_dim):
  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
  max_instances_per_image = config.max_instances_per_image
  eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
+  total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
+  train_steps = total_examples // FLAGS.train_batch_size
  use_tpu = (FLAGS.strategy == 'tpu')
  logging.info(params)

-  def _train(steps):
-    """Build train estimator and run training if steps > 0."""
-    train_estimator = tf.estimator.tpu.TPUEstimator(
-        model_fn=model_fn_instance,
-        use_tpu=use_tpu,
-        train_batch_size=FLAGS.train_batch_size,
-        config=run_config,
-        params=params)
-    train_estimator.train(
-        input_fn=dataloader.InputReader(
-            FLAGS.training_file_pattern,
-            is_training=True,
-            use_fake_data=FLAGS.use_fake_data,
-            max_instances_per_image=max_instances_per_image),
-        max_steps=steps)
-
-  def _eval(steps):
-    """Build estimator and eval the latest checkpoint if steps > 0."""
-    eval_params = dict(
-        params,
-        strategy=FLAGS.strategy,
-        input_rand_hflip=False,
-        is_training_bn=False,
-    )
-    eval_estimator = tf.estimator.tpu.TPUEstimator(
-        model_fn=model_fn_instance,
-        use_tpu=use_tpu,
-        train_batch_size=FLAGS.train_batch_size,
-        eval_batch_size=FLAGS.eval_batch_size,
-        config=run_config,
-        params=eval_params)
-    eval_results = eval_estimator.evaluate(
-        input_fn=dataloader.InputReader(
-            FLAGS.validation_file_pattern,
-            is_training=False,
-            max_instances_per_image=max_instances_per_image),
-        steps=steps,
-        name=FLAGS.eval_name)
-    logging.info('Evaluation results: %s', eval_results)
-    return eval_results
+  # Use the unified estimator, train, and eval interfaces.
+  estimator = tf.estimator.tpu.TPUEstimator(
+      model_fn=model_fn_instance,
+      use_tpu=use_tpu,
+      train_batch_size=FLAGS.train_batch_size,
+      eval_batch_size=FLAGS.eval_batch_size,
+      config=run_config,
+      params=params)
+  train_input_fn = dataloader.InputReader(
+      FLAGS.training_file_pattern,
+      is_training=True,
+      use_fake_data=FLAGS.use_fake_data,
+      max_instances_per_image=max_instances_per_image)
+  eval_input_fn = dataloader.InputReader(
+      FLAGS.validation_file_pattern,
+      is_training=False,
+      use_fake_data=FLAGS.use_fake_data,
+      max_instances_per_image=max_instances_per_image)

  # Start train/eval flow.
  if FLAGS.mode == 'train':
-    total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
-    _train(total_examples // FLAGS.train_batch_size)
+    estimator.train(input_fn=train_input_fn, max_steps=train_steps)
    if FLAGS.eval_after_training:
-      _eval(eval_steps)
+      estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

  elif FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint
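The refactor above replaces the two per-mode estimator builders with a single `TPUEstimator` plus shared input functions, and hoists the step arithmetic to one place. A quick worked example of that arithmetic (all numbers purely illustrative, not values from this repo's configs):

    # Illustrative numbers only.
    num_epochs, num_examples_per_epoch, train_batch_size = 300, 12000, 64
    total_examples = int(num_epochs * num_examples_per_epoch)  # 3,600,000
    train_steps = total_examples // train_batch_size           # 56,250 steps

    eval_samples, eval_batch_size = 5000, 64
    eval_steps = int(eval_samples // eval_batch_size)  # 78 batches; the last
    # 8 samples (5000 - 78 * 64) are dropped by the floor division.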
@@ -314,7 +283,7 @@ def _eval(steps):

      logging.info('Starting to evaluate.')
      try:
-        eval_results = _eval(eval_steps)
+        eval_results = estimator.evaluate(eval_input_fn, steps=eval_steps)
        # Terminate eval job when final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
@@ -323,53 +292,21 @@ def _eval(steps):
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
-        total_step = int((config.num_epochs * FLAGS.num_examples_per_epoch) /
-                         FLAGS.train_batch_size)
-        if current_step >= total_step:
-          logging.info('Evaluation finished after training step %d',
-                       current_step)
+        if current_step >= train_steps:
+          logging.info('Eval finished step %d/%d', current_step, train_steps)
          break

      except tf.errors.NotFoundError:
-        # Since the coordinator is on a different job than the TPU worker,
-        # sometimes the TPU worker does not finish initializing until long after
-        # the CPU job tells it to start evaluating. In this case, the checkpoint
-        # file could have been deleted already.
+        # The checkpoint may have been deleted by the time this eval runs,
+        # so simply skip such cases.
        logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

  elif FLAGS.mode == 'train_and_eval':
-    ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
-    if not ckpt and FLAGS.ckpt:
-      # Load the pretrained ckpt from FLAGS.ckpt at the begining of training.
-      ckpt = tf.train.latest_checkpoint(FLAGS.ckpt)
-    try:
-      step = int(os.path.basename(ckpt).split('-')[1])
-      current_epoch = (
-          step * FLAGS.train_batch_size // FLAGS.num_examples_per_epoch)
-      logging.info('found ckpt at step %d (epoch %d)', step, current_epoch)
-    except (IndexError, TypeError):
-      logging.info('Folder %s has no ckpt with valid step.', FLAGS.model_dir)
-      current_epoch = 0
-
-    def run_train_and_eval(e):
-      print('-----------------------------------------------------\n'
-            '=====> Starting training, epoch: %d.' % e)
-      _train(e * FLAGS.num_examples_per_epoch // FLAGS.train_batch_size)
-      print('-----------------------------------------------------\n'
-            '=====> Starting evaluation, epoch: %d.' % e)
-      eval_results = _eval(eval_steps)
-      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
-      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
-
-    epochs_per_cycle = 1  # higher number has less graph construction overhead.
-    for e in range(current_epoch + 1, config.num_epochs + 1, epochs_per_cycle):
-      if FLAGS.run_epoch_in_child_process:
-        p = multiprocessing.Process(target=run_train_and_eval, args=(e,))
-        p.start()
-        p.join()
-      else:
-        run_train_and_eval(e)
-
+    train_spec = tf.estimator.TrainSpec(
+        input_fn=train_input_fn, max_steps=train_steps)
+    eval_spec = tf.estimator.EvalSpec(
+        input_fn=eval_input_fn, steps=eval_steps, throttle_secs=600)
+    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  else:
    logging.info('Invalid mode: %s', FLAGS.mode)

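One behavioral note on the new `train_and_eval` path: unlike the removed per-epoch loop, `tf.estimator.train_and_evaluate` evaluates the most recent checkpoint, throttled here to at most one eval per `throttle_secs=600` seconds, so eval cadence follows checkpoint frequency rather than epoch boundaries. If per-epoch evals are still wanted, one option (a sketch, assuming `run_config` is a standard `tf.estimator`-style RunConfig built earlier in the file) is to save one checkpoint per epoch:

    # Sketch only: align checkpoints (and hence evals) with epoch boundaries.
    steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.train_batch_size
    run_config = run_config.replace(save_checkpoints_steps=steps_per_epoch)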