@@ -405,9 +405,7 @@ def model_fn(inputs):
405405 ema = tf .train .ExponentialMovingAverage (
406406 decay = moving_average_decay , num_updates = global_step )
407407 ema_vars = utils .get_ema_vars ()
408- if params ['strategy' ] == 'horovod' :
409- import horovod .tensorflow as hvd # pylint: disable=g-import-not-at-top
410- learning_rate = learning_rate * hvd .size ()
408+
411409 if mode == tf .estimator .ModeKeys .TRAIN :
412410 if params ['optimizer' ].lower () == 'sgd' :
413411 optimizer = tf .train .MomentumOptimizer (
@@ -419,9 +417,6 @@ def model_fn(inputs):
419417
420418 if params ['strategy' ] == 'tpu' :
421419 optimizer = tf .tpu .CrossShardOptimizer (optimizer )
422- elif params ['strategy' ] == 'horovod' :
423- optimizer = hvd .DistributedOptimizer (optimizer )
424- training_hooks = [hvd .BroadcastGlobalVariablesHook (0 )]
425420
426421 # Batch norm requires update_ops to be added as a train_op dependency.
427422 update_ops = tf .get_collection (tf .GraphKeys .UPDATE_OPS )
@@ -577,7 +572,6 @@ def scaffold_fn():
577572 skip_mismatch = params ['skip_mismatch' ])
578573
579574 tf .train .init_from_checkpoint (checkpoint , var_map )
580-
581575 return tf .train .Scaffold ()
582576 elif mode == tf .estimator .ModeKeys .EVAL and moving_average_decay :
583577
@@ -592,21 +586,22 @@ def scaffold_fn():
592586
593587 if params ['strategy' ] != 'tpu' :
594588 # Profile every 1K steps.
595- profile_hook = tf .train .ProfilerHook (
596- save_steps = 1000 , output_dir = params ['model_dir' ])
597- training_hooks .append (profile_hook )
589+ if params .get ('profile' , False ):
590+ profile_hook = tf .estimator .ProfilerHook (
591+ save_steps = 1000 , output_dir = params ['model_dir' ], show_memory = True )
592+ training_hooks .append (profile_hook )
598593
599- # Report memory allocation if OOM
600- class OomReportingHook (tf .estimator .SessionRunHook ):
594+ # Report memory allocation if OOM
595+ class OomReportingHook (tf .estimator .SessionRunHook ):
601596
602- def before_run (self , run_context ):
603- return tf .estimator .SessionRunArgs (
604- fetches = [],
605- options = tf .RunOptions (report_tensor_allocations_upon_oom = True ))
597+ def before_run (self , run_context ):
598+ return tf .estimator .SessionRunArgs (
599+ fetches = [],
600+ options = tf .RunOptions (report_tensor_allocations_upon_oom = True ))
606601
607- training_hooks .append (OomReportingHook ())
602+ training_hooks .append (OomReportingHook ())
608603
609- logging_hook = tf .train .LoggingTensorHook (
604+ logging_hook = tf .estimator .LoggingTensorHook (
610605 {
611606 'step' : global_step ,
612607 'det_loss' : det_loss ,
@@ -616,15 +611,24 @@ def before_run(self, run_context):
616611 every_n_iter = params .get ('iterations_per_loop' , 100 ),
617612 )
618613 training_hooks .append (logging_hook )
619-
620- return tf .estimator .tpu .TPUEstimatorSpec (
621- mode = mode ,
622- loss = total_loss ,
623- train_op = train_op ,
624- eval_metrics = eval_metrics ,
625- host_call = utils .get_tpu_host_call (global_step , params ),
626- scaffold_fn = scaffold_fn ,
627- training_hooks = training_hooks )
614+ if params ['strategy' ] == 'tpu' :
615+ return tf .estimator .tpu .TPUEstimatorSpec (
616+ mode = mode ,
617+ loss = total_loss ,
618+ train_op = train_op ,
619+ eval_metrics = eval_metrics ,
620+ host_call = utils .get_tpu_host_call (global_step , params ),
621+ scaffold_fn = scaffold_fn ,
622+ training_hooks = training_hooks )
623+ else :
624+ eval_metric_ops = eval_metrics [0 ](eval_metrics [1 ]) if eval_metrics else None
625+ return tf .estimator .EstimatorSpec (
626+ mode = mode ,
627+ loss = total_loss ,
628+ train_op = train_op ,
629+ eval_metric_ops = eval_metric_ops ,
630+ scaffold = scaffold_fn (),
631+ training_hooks = training_hooks )
628632
629633
630634def efficientdet_model_fn (features , labels , mode , params ):
0 commit comments