|
3 | 3 | from mozi.utils.theano_utils import shared_zeros |
4 | 4 | from mozi.weight_init import UniformWeight |
5 | 5 | import theano.tensor as T |
| 6 | +import theano |
| 7 | +floatX = theano.config.floatX |
6 | 8 |
|
class BatchNormalization(Template):

    def __init__(self, input_shape, epsilon=1e-6, mode=0, gamma_init=None, memory=0.9):
        '''
        Batch normalization layer.

        REFERENCE:
            Batch Normalization: Accelerating Deep Network Training by
            Reducing Internal Covariate Shift
            http://arxiv.org/pdf/1502.03167v3.pdf

        PARAMS:
            input_shape: shape of the learnable scale (gamma) and shift (beta)
                parameters, one per normalized feature.
            epsilon: small constant added to the std denominator to prevent
                division by zero.
            mode: kept for backward compatibility with older callers; unused.
            gamma_init: initializer callable for gamma. Defaults to a fresh
                UniformWeight() built per instance — a UniformWeight() default
                argument would be evaluated once at definition time and shared
                by every layer.
            memory: moving-average weight. With the latest batch statistic y_t
                and running value x_t, the update is
                    x_tp1 = memory * y_t + (1 - memory) * x_t,
                so the larger the memory, the more weight is put on the
                contemporary batch.
        '''
        self.input_shape = input_shape
        self.epsilon = epsilon
        self.mem = memory

        # Avoid the shared-mutable-default pitfall: construct the initializer
        # here rather than in the signature.
        if gamma_init is None:
            gamma_init = UniformWeight()
        self.gamma = gamma_init(self.input_shape, name='gamma')
        self.beta = shared_zeros(self.input_shape, name='beta')

        # Running estimates of the batch mean/std, consumed by _test_fprop.
        self.moving_mean = 0
        self.moving_std = 0

        self.params = [self.gamma, self.beta]


    def _train_fprop(self, state_below):
        '''Normalize with the current batch statistics and update the
        moving averages used at test time.'''
        miu = state_below.mean(axis=0)
        std = T.std(state_below, axis=0)
        Z = (state_below - miu) / (std + self.epsilon)

        # BUG FIX: these updates previously used `+=`, i.e.
        #     x_tp1 = x_t + (mem * y_t + (1 - mem) * x_t)
        # which adds the old running value twice instead of the convex
        # combination documented in __init__. Plain assignment implements
        #     x_tp1 = mem * y_t + (1 - mem) * x_t.
        self.moving_mean = self.mem * miu + (1 - self.mem) * self.moving_mean
        self.moving_std = self.mem * std + (1 - self.mem) * self.moving_std

        return self.gamma * Z + self.beta


    def _test_fprop(self, state_below):
        '''Normalize with the moving-average statistics accumulated during
        training, then apply the learned scale and shift.'''
        Z = (state_below - self.moving_mean) / (self.moving_std + self.epsilon)
        return self.gamma * Z + self.beta
71 | 51 |
|
72 | 52 |
|
73 | 53 | class LRN(Template): |
74 | 54 | """ |
| 55 | + Adapted from pylearn2 |
75 | 56 | Local Response Normalization |
76 | 57 | """ |
77 | 58 |
|
|
0 commit comments