
Commit e494379

fix #889 (#890)
* fix #889
* revert changes and add limits to readme
* add limits
* update readme
1 parent b640618 commit e494379

File tree (4 files changed: +7, -16 lines)

* efficientdet/README.md
* efficientdet/det_model_fn.py
* efficientdet/main.py
* efficientdet/utils.py


efficientdet/README.md (2 additions, 3 deletions)

@@ -369,15 +369,15 @@ For more instructions about training on TPUs, please refer to the following tuto
 
 * EfficientNet tutorial: https://cloud.google.com/tpu/docs/tutorials/efficientnet
 
-## 11. Reducing Memory Usage when Training EfficientDets on GPU.
+## 11. Reducing Memory Usage when Training EfficientDets on GPU. (The current approach doesn't support mirrored multi-GPU or mixed-precision training.)
 
 EfficientDets use a lot of GPU memory for a few reasons:
 
 * Large input resolution: because resolution is one of the scaling dimensions, our resolution tends to be higher, which significantly increases activations (with no increase in parameters).
 * Large internal activations for the backbone: our backbone uses a relatively large expansion ratio (6), causing large expanded activations.
 * Deep BiFPN: our BiFPN has multiple top-down and bottom-up paths, which leads to a lot of intermediate memory usage during training.
 
-To train this model on a GPU with limited memory, there is an experimental option gradient_checkpointing.
+To train this model on a GPU with limited memory, there is an experimental option grad_checkpoint.
 
 Check these links for a high-level idea of what gradient checkpointing is doing:
 1. https://medium.com/tensorflow/fitting-larger-networks-into-memory-583e3c758ff9

@@ -387,7 +387,6 @@ Check these links for a high-level idea of what gradient checkpointing is doing:
 If set to True, the keras model uses ```tf.recompute_grad``` to achieve gradient checkpointing.
 
 Testing shows that:
-* It allows training a d7x network with a batch size of 2 via keras/train.py on an 11GB (1080Ti) GPU
 * It also allows training a d6 network with a batch size of 2 via main.py on an 11GB (1080Ti) GPU
 
 ## 12. Visualize TF-Records.
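
The README text above describes `grad_checkpoint` only at a high level. As a rough illustration of the underlying mechanism, here is a minimal, self-contained sketch of gradient checkpointing with `tf.recompute_grad`; the weights, shapes, and block definition are invented for the example and are not the EfficientDet code.

```python
import tensorflow as tf

# Hypothetical stand-in for a memory-hungry sub-network (weights and shapes
# are arbitrary; this is not the EfficientDet backbone or BiFPN).
w1 = tf.Variable(tf.random.normal([256, 256]))
w2 = tf.Variable(tf.random.normal([256, 256]))

def block(x):
  # Two chained matmuls: normally the intermediate activation is kept alive
  # until the backward pass.
  return tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(x, w1)), w2))

# tf.recompute_grad returns a wrapped callable that discards the block's
# intermediate activations after the forward pass and recomputes them from
# the block inputs during backprop, trading extra compute for memory.
block_ckpt = tf.recompute_grad(block)

x = tf.random.normal([32, 256])
with tf.GradientTape() as tape:
  tape.watch(x)
  loss = tf.reduce_mean(block_ckpt(x))

# Gradients flow through the recomputed block as usual.
grads = tape.gradient(loss, x)
```

In the repository itself, `grad_checkpoint` is a config flag and the wrapping happens inside the Keras model code, so this sketch only mirrors the idea, not the actual call sites.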

efficientdet/det_model_fn.py (0 additions, 3 deletions)

@@ -386,9 +386,6 @@ def model_fn(inputs):
 
   if is_tpu:
     optimizer = tf.tpu.CrossShardOptimizer(optimizer)
-  elif params['mixed_precision']:
-    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
-        optimizer)
 
   # Batch norm requires update_ops to be added as a train_op dependency.
   update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
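
For context on the deleted branch: `enable_mixed_precision_graph_rewrite` wraps an existing TF1-style optimizer so that eligible ops are rewritten to float16 and loss scaling is added automatically. A minimal sketch of that (now removed) pattern, with a made-up optimizer and learning rate rather than EfficientDet's actual optimizer setup, might look like this:

```python
import tensorflow.compat.v1 as tf

# The graph rewrite only affects graph execution (Estimator/Session), so this
# sketch assumes TF1-style graph mode.
tf.disable_eager_execution()

# Hypothetical optimizer; EfficientDet builds its optimizer elsewhere in
# det_model_fn.py.
optimizer = tf.train.MomentumOptimizer(learning_rate=0.08, momentum=0.9)

# Wrap the optimizer: TensorFlow rewrites eligible ops to float16 in the graph
# and enables dynamic loss scaling. This is the call the commit removes.
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
    optimizer, loss_scale='dynamic')
```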

efficientdet/main.py (0 additions, 2 deletions)

@@ -127,8 +127,6 @@ def main(_):
     tpu_grpc_url = tpu_cluster_resolver.get_master()
     tf.Session.reset(tpu_grpc_url)
   else:
-    # Always enable auto mixed precision graph rewrite
-    os.environ['TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE'] = '1'
     tpu_cluster_resolver = None
 
   # Check data path

efficientdet/utils.py (5 additions, 8 deletions)

@@ -618,14 +618,11 @@ def build_model_with_precision(pp, mm, ii, tt, *args, **kwargs):
       outputs = mm(inputs, *args, **kwargs)
     set_precision_policy('float32')
   elif pp == 'mixed_float16':
-    if tt:
-      outputs = mm(ii, *args, **kwargs)
-    else:
-      set_precision_policy(pp, loss_scale=tt)
-      inputs = tf.cast(ii, tf.float16)
-      with float16_scope():
-        outputs = mm(inputs, *args, **kwargs)
-      set_precision_policy('float32')
+    set_precision_policy(pp, loss_scale=tt)
+    inputs = tf.cast(ii, tf.float16)
+    with float16_scope():
+      outputs = mm(inputs, *args, **kwargs)
+    set_precision_policy('float32')
   elif not pp or pp == 'float32':
     outputs = mm(ii, *args, **kwargs)
   else:
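
The change above makes the `mixed_float16` branch of `build_model_with_precision` unconditional: set the float16 policy, cast the inputs, run the model under a float16 scope, then restore float32. `set_precision_policy` and `float16_scope` are helpers defined elsewhere in utils.py; a rough, standalone approximation of the same pattern using only the stock Keras mixed-precision API (and omitting the loss-scale argument `tt`) could look like the sketch below, with an invented toy model and input shape.

```python
import tensorflow as tf
from tensorflow.keras import mixed_precision

def build_and_run_in_mixed_float16(model_fn, images):
  """Rough analogue of the mixed_float16 branch: policy on, cast, run, policy off."""
  # Enable the float16 policy *before* layers are constructed, so they pick up
  # a float16 compute dtype with float32 variables.
  mixed_precision.set_global_policy('mixed_float16')
  inputs = tf.cast(images, tf.float16)
  outputs = model_fn(inputs)
  # Restore the default policy so later model builds are unaffected.
  mixed_precision.set_global_policy('float32')
  return outputs

# Hypothetical usage: model_fn builds and applies a toy network, mirroring how
# build_model_with_precision receives the model as a callable (mm) and the
# inputs separately (ii).
def toy_model_fn(x):
  return tf.keras.layers.Conv2D(8, 3, padding='same')(x)

result = build_and_run_in_mixed_float16(toy_model_fn, tf.random.normal([1, 64, 64, 3]))
```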
