Skip to content

Commit 8a49f7f

Browse files
Add configurable epsilon parameter to batch normalization (bn) layers
1 parent 08bc08d commit 8a49f7f

File tree

11 files changed

+33
-15
lines changed

11 files changed

+33
-15
lines changed

paddle/gserver/layers/BatchNormBaseLayer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
4141
useGlobalStats_ = config_.use_global_stats();
4242
}
4343
movingAvgFraction_ = config_.moving_average_fraction();
44+
EPS = config_.epsilon();
4445

4546
weight_.reset(new Weight(1, channels_, parameters_[0]));
4647
movingMean_.reset(new Weight(1, channels_, parameters_[1]));

paddle/gserver/layers/BatchNormBaseLayer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ class BatchNormBaseLayer : public Layer {
9494
bool useGlobalStats_;
9595
// used to compute the moving mean and variance.
9696
real movingAvgFraction_;
97+
// Epsilon value used in the batch normalization formula.
98+
real EPS;
9799
};
98100

99101
} // namespace paddle

paddle/gserver/layers/BatchNormalizationLayer.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@ namespace paddle {
2222

2323
REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
2424

25-
const real BatchNormalizationLayer::EPS = 1E-5;
26-
2725
bool BatchNormalizationLayer::init(const LayerMap& layerMap,
2826
const ParameterMap& parameterMap) {
2927
/* Initialize the basic parent class */

paddle/gserver/layers/BatchNormalizationLayer.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,6 @@ class BatchNormalizationLayer : public BatchNormBaseLayer {
3939
void backward(const UpdateCallback& callback = nullptr) override;
4040

4141
protected:
42-
/// Epsilon value used in the batch normalization formula.
43-
static const real EPS;
44-
4542
/// Load pre-calculated mean and std.
4643
void setMeanAndStd();
4744

paddle/gserver/layers/CudnnBatchNormLayer.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ namespace paddle {
2121

2222
REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);
2323

24-
const double CudnnBatchNormLayer::EPS = 1E-5;
24+
const double CudnnBatchNormLayer::MIN_EPS = 1E-5;
2525

2626
bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
2727
const ParameterMap& parameterMap) {
@@ -60,6 +60,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
6060
real* beta = biases_->getW()->getData();
6161
real* movingMean = movingMean_->getW()->getData();
6262
real* movingVar = movingVar_->getW()->getData();
63+
EPS_ = std::max(MIN_EPS, static_cast<double>(EPS));
6364

6465
if (!useGlobalStats_) {
6566
REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
@@ -75,7 +76,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
7576
1.0 - movingAvgFraction_,
7677
movingMean,
7778
movingVar,
78-
EPS,
79+
EPS_,
7980
savedMean,
8081
savedInvVar);
8182
} else {
@@ -90,7 +91,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
9091
beta,
9192
movingMean,
9293
movingVar,
93-
EPS);
94+
EPS_);
9495
} else {
9596
// There is a limitation in cudnn library.
9697
// When the batch size is larger than 1024 in cuDNN v5.1,
@@ -101,7 +102,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
101102
beta,
102103
movingMean,
103104
movingVar,
104-
EPS,
105+
EPS_,
105106
batchSize,
106107
channels_,
107108
imageH_ * imageD_,
@@ -127,6 +128,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
127128
real* gamma = weight_->getW()->getData();
128129
real* savedMean = savedMean_->getData();
129130
real* savedInvVar = savedInvVar_->getData();
131+
EPS_ = std::max(MIN_EPS, static_cast<double>(EPS));
130132

131133
auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
132134
Matrix::resizeOrCreate(m, h, w, false, true);
@@ -157,7 +159,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
157159
gamma,
158160
gammaGrad,
159161
betaGrad,
160-
EPS,
162+
EPS_,
161163
savedMean,
162164
savedInvVar);
163165

paddle/gserver/layers/CudnnBatchNormLayer.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,14 @@ class CudnnBatchNormLayer : public BatchNormBaseLayer {
4747

4848
protected:
4949
/**
50-
* Epsilon value used in the batch normalization formula.
5150
* Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
5251
* Same epsilon value should be used in forward and backward functions.
5352
*/
54-
static const double EPS;
53+
static const double MIN_EPS;
54+
55+
/// Epsilon value used in the batch normalization formula.
56+
/// If EPS_ is smaller than MIN_EPS, MIN_EPS will be used.
57+
double EPS_;
5558

5659
/// Input/output tensor descriptor desc
5760
hl_tensor_descriptor ioDesc_;

paddle/gserver/layers/MKLDNNBatchNormLayer.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@ namespace paddle {
2121

2222
REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);
2323

24-
const real MKLDNNBatchNormLayer::EPS = 1E-5;
25-
2624
bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
2725
const ParameterMap& parameterMap) {
2826
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
@@ -50,6 +48,8 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
5048
useGlobalStats_ = config_.use_global_stats();
5149
}
5250
movingAvgFraction_ = config_.moving_average_fraction();
51+
EPS = config_.epsilon();
52+
5353
VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
5454
<< " --- global stats";
5555
VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;

paddle/gserver/layers/MKLDNNBatchNormLayer.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer {
3232
std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
3333

3434
// Epsilon value used in the batch normalization formula.
35-
static const real EPS;
35+
real EPS;
36+
3637
// weight and bias in paddle
3738
std::unique_ptr<Weight> weight_;
3839
std::unique_ptr<Weight> biases_;

proto/ModelConfig.proto

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,10 @@ message LayerConfig {
540540

541541
// for switch order layer
542542
optional ReshapeConfig reshape_conf = 59;
543+
544+
// for batch normalization layer
545+
// Small constant added to the variance to avoid numerical problems.
546+
optional double epsilon = 60 [ default = 0.00001 ];
543547
}
544548

545549
message EvaluatorConfig {

python/paddle/trainer/config_parser.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2434,6 +2434,7 @@ def __init__(self,
24342434
bias=True,
24352435
img3D=False,
24362436
use_global_stats=True,
2437+
epsilon=1e-5,
24372438
moving_average_fraction=0.9,
24382439
batch_norm_type=None,
24392440
mean_var_names=None,
@@ -2482,6 +2483,8 @@ def __init__(self,
24822483
self.config.use_global_stats = use_global_stats
24832484
if moving_average_fraction is not None:
24842485
self.config.moving_average_fraction = moving_average_fraction
2486+
if epsilon is not None:
2487+
self.config.epsilon = epsilon
24852488

24862489
input_layer = self.get_input_layer(0)
24872490
image_conf = self.config.inputs[0].image_conf

0 commit comments

Comments
 (0)