Commit 870c510

replace horovod by estimator (#732)
* replace horovod by estimator
* remove useless functions
* remove useless hparam
* update readme
* fix style
* revert dataloader
* update comment
* enable eager execution
* use old batch norm
* remove tf.function

1 parent cdec8d6 · commit 870c510

9 files changed: +103 additions, -137 deletions

efficientdet/README.md

Lines changed: 2 additions & 4 deletions
```diff
@@ -312,8 +312,6 @@ You should check more details of runmode which is written in caption-4.
 
 ## 9. Train on multi GPUs.
 
-Install [horovod](https://github.com/horovod/horovod#id6).
-
 Create a config file for the PASCAL VOC dataset called voc_config.yaml and put this in it.
 
 num_classes: 21
@@ -327,7 +325,7 @@ Download efficientdet coco checkpoint.
 
 Finetune needs to use --ckpt rather than --backbone_ckpt.
 
-!horovodrun -np <num_gpus> -H localhost:<num_gpus> python main.py --mode=train \
+python main.py --mode=train \
 --training_file_pattern=tfrecord/pascal*.tfrecord \
 --validation_file_pattern=tfrecord/pascal*.tfrecord \
 --model_name=efficientdet-d0 \
@@ -337,7 +335,7 @@ Finetune needs to use --ckpt rather than --backbone_ckpt.
 --eval_batch_size=64 --eval_samples=1024 \
 --num_examples_per_epoch=5717 --num_epochs=50 \
 --hparams=voc_config.yaml \
---strategy=horovod
+--strategy=gpus
 
 If you want to do inference for custom data, you can run
 
```
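With the horovodrun launcher gone, multi-GPU data parallelism presumably comes from a tf.distribute strategy that main.py attaches to the Estimator when --strategy=gpus is set. A minimal sketch of that wiring, with a stand-in model_fn (the real one lives in det_model_fn.py); the names here are illustrative assumptions, not the repo's actual code:

```python
import tensorflow.compat.v1 as tf


def model_fn(features, labels, mode, params):
  # Stand-in for det_model_fn: a single learnable weight and a squared error.
  w = tf.get_variable('w', shape=[], initializer=tf.ones_initializer())
  loss = tf.reduce_mean(tf.square(w * features['x'] - labels))
  train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
      loss, global_step=tf.train.get_or_create_global_step())
  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)


# MirroredStrategy creates one replica per visible GPU, so a plain
# `python main.py ... --strategy=gpus` replaces `horovodrun -np <num_gpus>`.
strategy = tf.distribute.MirroredStrategy()
run_config = tf.estimator.RunConfig(train_distribute=strategy)
estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
```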

efficientdet/aug/autoaugment.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -1633,6 +1633,7 @@ def final_policy(image_, bboxes_):
   return (augmented_images, augmented_bboxes)
 
 
+@tf.autograph.experimental.do_not_convert
 def distort_image_with_autoaugment(image,
                                    bboxes,
                                    augmentation_name,
```
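For reference, tf.autograph.experimental.do_not_convert tells AutoGraph to execute the decorated function's Python body as-is instead of rewriting it into graph control flow, which sidesteps conversion failures when a function like distort_image_with_autoaugment is traced inside a tf.data pipeline. A standalone sketch with a toy function, not from this repo:

```python
import tensorflow as tf


@tf.autograph.experimental.do_not_convert
def maybe_flip(image, flip):
  # Plain-Python branching that AutoGraph is told to leave untouched; the
  # function still emits valid TF ops when called during tracing.
  if flip:
    return tf.image.flip_left_right(image)
  return image
```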

efficientdet/dataloader.py

Lines changed: 11 additions & 6 deletions
```diff
@@ -22,7 +22,7 @@
 from object_detection import tf_example_decoder
 
 
-class InputProcessor(object):
+class InputProcessor:
   """Base class of Input processor."""
 
   def __init__(self, image, output_size):
@@ -207,12 +207,10 @@ def offset_y(self):
 
 def pad_to_fixed_size(data, pad_value, output_shape):
   """Pad data to a fixed length at the first dimension.
-
   Args:
     data: Tensor to be padded to output_shape.
     pad_value: A constant value assigned to the paddings.
     output_shape: The output shape of a 2D tensor.
-
   Returns:
     The Padded tensor with output_shape [max_instances_per_image, dimension].
   """
@@ -230,7 +228,7 @@ def pad_to_fixed_size(data, pad_value, output_shape):
   return padded_data
 
 
-class InputReader(object):
+class InputReader:
   """Input reader for dataset."""
 
   def __init__(self,
@@ -374,7 +372,8 @@ def process_example(self, params, batch_size, images, cls_targets,
     labels['image_masks'] = image_masks
     return images, labels
 
-  def __call__(self, params):
+
+  def __call__(self, params, input_context=None):
     input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
@@ -391,7 +390,9 @@ def __call__(self, params):
         self._file_pattern, shuffle=self._is_training)
     if self._is_training:
       dataset = dataset.repeat()
-
+    if input_context:
+      dataset = dataset.shard(input_context.num_input_pipelines,
+                              input_context.input_pipeline_id)
     # Prefetch data from files.
     def _prefetch_dataset(filename):
       if params.get('dataset_type', None) == 'sstable':
@@ -404,6 +405,9 @@ def _prefetch_dataset(filename):
         _prefetch_dataset, num_parallel_calls=tf.data.experimental.AUTOTUNE)
     options = tf.data.Options()
     options.experimental_deterministic = not self._is_training
+    options.experimental_optimization.map_vectorization.enabled = True
+    options.experimental_optimization.map_parallelization = True
+    options.experimental_optimization.parallel_batch = True
     dataset = dataset.with_options(options)
     if self._is_training:
       dataset = dataset.shuffle(64)
@@ -429,4 +433,5 @@ def _prefetch_dataset(filename):
       # first batch. This reduces variance in performance and is useful in
       # testing.
       dataset = dataset.take(1).cache().repeat()
+      dataset = dataset.apply(tf.data.experimental.ignore_errors())
     return dataset
```
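The new input_context parameter follows the tf.distribute input-function contract: under a distribution strategy the Estimator may invoke the input fn once per worker, passing a tf.distribute.InputContext, and sharding by input_pipeline_id gives each pipeline a disjoint slice of the files, exactly as the hunk above does. A minimal standalone sketch (the file pattern is illustrative):

```python
import tensorflow as tf


def input_fn(params, input_context=None):
  """Sketch of the sharding contract adopted by InputReader.__call__."""
  dataset = tf.data.Dataset.list_files('tfrecord/pascal*.tfrecord',
                                       shuffle=False)
  if input_context:
    # Keep every num_input_pipelines-th file, offset by this pipeline's id,
    # so no two workers read the same record within an epoch.
    dataset = dataset.shard(input_context.num_input_pipelines,
                            input_context.input_pipeline_id)
  return dataset.interleave(tf.data.TFRecordDataset, cycle_length=4)
```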

efficientdet/det_model_fn.py

Lines changed: 31 additions & 27 deletions
```diff
@@ -405,9 +405,7 @@ def model_fn(inputs):
   ema = tf.train.ExponentialMovingAverage(
       decay=moving_average_decay, num_updates=global_step)
   ema_vars = utils.get_ema_vars()
-  if params['strategy'] == 'horovod':
-    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
-    learning_rate = learning_rate * hvd.size()
+
   if mode == tf.estimator.ModeKeys.TRAIN:
     if params['optimizer'].lower() == 'sgd':
       optimizer = tf.train.MomentumOptimizer(
@@ -419,9 +417,6 @@ def model_fn(inputs):
 
     if params['strategy'] == 'tpu':
       optimizer = tf.tpu.CrossShardOptimizer(optimizer)
-    elif params['strategy'] == 'horovod':
-      optimizer = hvd.DistributedOptimizer(optimizer)
-      training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
 
     # Batch norm requires update_ops to be added as a train_op dependency.
     update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
@@ -577,7 +572,6 @@ def scaffold_fn():
               skip_mismatch=params['skip_mismatch'])
 
       tf.train.init_from_checkpoint(checkpoint, var_map)
-
       return tf.train.Scaffold()
   elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:
 
@@ -592,21 +586,22 @@ def scaffold_fn():
 
   if params['strategy'] != 'tpu':
     # Profile every 1K steps.
-    profile_hook = tf.train.ProfilerHook(
-        save_steps=1000, output_dir=params['model_dir'])
-    training_hooks.append(profile_hook)
+    if params.get('profile', False):
+      profile_hook = tf.estimator.ProfilerHook(
+          save_steps=1000, output_dir=params['model_dir'], show_memory=True)
+      training_hooks.append(profile_hook)
 
-    # Report memory allocation if OOM
-    class OomReportingHook(tf.estimator.SessionRunHook):
+      # Report memory allocation if OOM
+      class OomReportingHook(tf.estimator.SessionRunHook):
 
-      def before_run(self, run_context):
-        return tf.estimator.SessionRunArgs(
-            fetches=[],
-            options=tf.RunOptions(report_tensor_allocations_upon_oom=True))
+        def before_run(self, run_context):
+          return tf.estimator.SessionRunArgs(
+              fetches=[],
+              options=tf.RunOptions(report_tensor_allocations_upon_oom=True))
 
-    training_hooks.append(OomReportingHook())
+      training_hooks.append(OomReportingHook())
 
-    logging_hook = tf.train.LoggingTensorHook(
+    logging_hook = tf.estimator.LoggingTensorHook(
         {
             'step': global_step,
             'det_loss': det_loss,
@@ -616,15 +611,24 @@ def before_run(self, run_context):
         every_n_iter=params.get('iterations_per_loop', 100),
     )
     training_hooks.append(logging_hook)
-
-  return tf.estimator.tpu.TPUEstimatorSpec(
-      mode=mode,
-      loss=total_loss,
-      train_op=train_op,
-      eval_metrics=eval_metrics,
-      host_call=utils.get_tpu_host_call(global_step, params),
-      scaffold_fn=scaffold_fn,
-      training_hooks=training_hooks)
+  if params['strategy'] == 'tpu':
+    return tf.estimator.tpu.TPUEstimatorSpec(
+        mode=mode,
+        loss=total_loss,
+        train_op=train_op,
+        eval_metrics=eval_metrics,
+        host_call=utils.get_tpu_host_call(global_step, params),
+        scaffold_fn=scaffold_fn,
+        training_hooks=training_hooks)
+  else:
+    eval_metric_ops = eval_metrics[0](eval_metrics[1]) if eval_metrics else None
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        loss=total_loss,
+        train_op=train_op,
+        eval_metric_ops=eval_metric_ops,
+        scaffold=scaffold_fn(),
+        training_hooks=training_hooks)
 
 
 def efficientdet_model_fn(features, labels, mode, params):
```
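The new else branch bridges the two spec conventions: TPUEstimatorSpec accepts a deferred (metric_fn, tensors) pair in eval_metrics, while plain EstimatorSpec expects an already materialized eval_metric_ops dict, hence the eval_metrics[0](eval_metrics[1]) call. A standalone sketch of that convention, assuming a metric_fn that takes the tensor dict as a single argument (as the call site implies); the metric and tensors are illustrative:

```python
import tensorflow.compat.v1 as tf


def metric_fn(tensors):
  # Turns raw eval tensors into the {name: (value_op, update_op)} dict
  # that tf.estimator.EstimatorSpec expects in eval_metric_ops.
  return {'mean_det_loss': tf.metrics.mean(tensors['det_loss'])}


det_loss = tf.constant([0.4, 0.6])
eval_metrics = (metric_fn, {'det_loss': det_loss})  # TPU-style deferred pair
eval_metric_ops = eval_metrics[0](eval_metrics[1])  # what the else branch does
```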

efficientdet/efficientdet_arch.py

Lines changed: 0 additions & 28 deletions
```diff
@@ -52,34 +52,6 @@ def freeze_vars(variables, pattern):
   return variables
 
 
-def resize_bilinear(images, size, output_type):
-  """Returns resized images as output_type."""
-  images = tf.image.resize_bilinear(images, size, align_corners=True)
-  return tf.cast(images, output_type)
-
-
-def remove_variables(variables, resnet_depth=50):
-  """Removes low-level variables from the input.
-
-  Removing low-level parameters (e.g., initial convolution layer) from training
-  usually leads to higher training speed and slightly better testing accuracy.
-  The intuition is that the low-level architecture (e.g., ResNet-50) is able to
-  capture low-level features such as edges; therefore, it does not need to be
-  fine-tuned for the detection task.
-
-  Args:
-    variables: all the variables in training
-    resnet_depth: the depth of ResNet model
-
-  Returns:
-    var_list: a list containing variables for training
-
-  """
-  var_list = [v for v in variables
-              if v.name.find('resnet%s/conv2d/' % resnet_depth) == -1]
-  return var_list
-
-
 def resample_feature_map(feat,
                          name,
                          target_height,
```

efficientdet/hparams_config.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -278,8 +278,6 @@ def default_detection_configs():
   h.dataset_type = None
   h.positives_momentum = None
 
-  # unused.
-  h.resnet_depth = 50
   return h
 
 
```
efficientdet/keras/util_keras.py

Lines changed: 1 addition & 8 deletions
```diff
@@ -44,14 +44,7 @@ def build_batch_norm(is_training_bn: bool,
     A normalized `Tensor` with the same `data_format`.
   """
   axis = 1 if data_format == 'channels_first' else -1
-  if is_training_bn and strategy in ('gpus',):
-    batch_norm_class = tf.keras.layers.experimental.SyncBatchNormalization
-  elif (not tf.compat.v1.executing_eagerly_outside_functions() or
-        (is_training_bn and strategy in ('tpu',))):
-    # TODO(tanmingxing): compare them on TPU.
-    batch_norm_class = utils.batch_norm_class(is_training_bn, strategy)
-  else:
-    batch_norm_class = tf.keras.layers.BatchNormalization
+  batch_norm_class = utils.batch_norm_class(is_training_bn, strategy)
 
   bn_layer = batch_norm_class(
       axis=axis,
```
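Per the "use old batch norm" note in the commit message, the Keras path now defers entirely to utils.batch_norm_class instead of special-casing SyncBatchNormalization for GPUs. A sketch of the shape such a helper plausibly has; this is an assumption for illustration, not the repo's utils.py:

```python
import tensorflow as tf


class TpuBatchNormalization(tf.keras.layers.BatchNormalization):
  """Placeholder for a batch norm that averages statistics across TPU shards."""


def batch_norm_class(is_training_bn, strategy=None):
  # Cross-shard statistics only matter while training on TPU; everywhere
  # else the stock Keras layer is returned.
  if is_training_bn and strategy == 'tpu':
    return TpuBatchNormalization
  return tf.keras.layers.BatchNormalization
```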
