Commit 906e256

Add acc test to image classification (#5336)
* add acc layer
* memory log level change from 3 to 10
* use gaussian random to init conv parameters
* use initializer
* fix import
* batch_norm use helper to create persistable var
* refine code
* train only 2 batches for test
* use g_program and g_init_program
* use XavierInitializer to init fc parameter
1 parent 7484915 commit 906e256

6 files changed (+63, −62 lines)

paddle/framework/operator.h

Lines changed: 0 additions & 2 deletions
@@ -408,7 +408,6 @@ class OperatorWithKernel : public OperatorBase {
   // indicate kernel DataType by input data. Defaultly all input data must be
   // same.
   virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
-    VLOG(3) << "Default IndicateDataType " << this->Type();
     auto& scope = ctx.scope();
     int data_type = -1;
     for (auto& input : this->inputs_) {
@@ -425,7 +424,6 @@ class OperatorWithKernel : public OperatorBase {
       }
       if (t != nullptr) {
         int tmp = static_cast<int>(ToDataType(t->type()));
-        VLOG(3) << "Input " << ipt_name << " with data_type " << tmp;
         PADDLE_ENFORCE(tmp == data_type || data_type == -1,
                        "DataType of Paddle Op %s must be the same.",
                        Type());

paddle/operators/batch_norm_op.cc

Lines changed: 4 additions & 1 deletion
@@ -51,6 +51,10 @@ class BatchNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
     PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");
 
+    const float epsilon = ctx->Attrs().Get<float>("epsilon");
+    PADDLE_ENFORCE_GE(epsilon, 0.0, "epsilon should be larger than 0");
+    PADDLE_ENFORCE_LE(epsilon, 0.001, "epsilon should not be too large");
+
     // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
     PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
                       "Mean and MeanOut should share the same memory");
@@ -297,7 +301,6 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
 
   framework::DataType IndicateDataType(
       const framework::ExecutionContext &ctx) const override {
-    VLOG(3) << "IndicateDataType " << this->Type();
     const auto *var = ctx.InputVar(framework::GradVarName("Y"));
     if (var == nullptr) {
       PADDLE_THROW("can't find Y@GRAD");
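The new bounds pin epsilon to a sensible range for the batch-norm transform y = scale * (x − mean) / sqrt(variance + epsilon) + bias, where epsilon only has to keep the denominator away from zero. A minimal NumPy sketch of the normalization the check protects (an illustration of the math, not the operator's actual kernel):

import numpy as np

def batch_norm_forward(x, scale, bias, epsilon=1e-5):
    # Normalize each channel over the batch, then apply the learned scale/bias.
    mean = x.mean(axis=0)
    variance = x.var(axis=0)
    x_hat = (x - mean) / np.sqrt(variance + epsilon)  # epsilon keeps the sqrt positive
    return scale * x_hat + bias

x = np.random.randn(128, 64).astype('float32')  # batch of 128 samples, 64 channels
y = batch_norm_forward(x, scale=np.ones(64), bias=np.zeros(64))

An epsilon of 1e05 (100000) would swamp the variance entirely, which is exactly what the 0.001 upper bound rejects; the matching Python default is corrected to 1e-05 in layers.py below.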

python/paddle/v2/framework/layer_helper.py

Lines changed: 4 additions & 1 deletion
@@ -112,9 +112,12 @@ def input_dtype(self, input_param_name='input'):
                 raise ValueError("Data Type mismatch")
         return dtype
 
-    def create_parameter(self, attr, shape, dtype, suffix='w'):
+    def create_parameter(self, attr, shape, dtype, suffix='w',
+                         initializer=None):
         # Deepcopy the attr so that parameters can be shared in program
         attr_copy = copy.deepcopy(attr)
+        if initializer is not None:
+            attr_copy['initializer'] = initializer
         if attr_copy['name'] is None:
             attr_copy['name'] = unique_name(".".join([self.name, suffix]))
         self.init_program.global_block().create_parameter(

python/paddle/v2/framework/layers.py

Lines changed: 28 additions & 22 deletions
@@ -1,8 +1,7 @@
-from paddle.v2.framework.layer_helper import LayerHelper, unique_name
 import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, \
-    Operator
-from paddle.v2.framework.initializer import ConstantInitializer
+from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, Operator
+from paddle.v2.framework.initializer import ConstantInitializer, NormalInitializer
+from paddle.v2.framework.layer_helper import LayerHelper, unique_name
 import re
 
 __all__ = [
@@ -344,8 +343,13 @@ def conv2d(input,
 
     input_shape = input.shape
     filter_shape = [num_filters, num_filter_channels] + filter_size
+
+    std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
     filter = helper.create_parameter(
-        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
+        attr=helper.param_attr,
+        shape=filter_shape,
+        dtype=dtype,
+        initializer=NormalInitializer(0.0, std, 0))
     pre_bias = helper.create_tmp_variable(dtype)
 
     helper.append_op(
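The Gaussian standard deviation above follows a He/MSRA-style fan-in rule, with fan-in taken as filter_size[0]**2 * num_channels (square filters are assumed, since only the first filter dimension appears). A quick worked example of what the initializer receives:

# A 3x3 convolution over 64 input channels:
filter_size = [3, 3]
num_channels = 64
std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
print(std)  # ~0.0589, so filter weights are drawn from N(0.0, 0.0589)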
@@ -420,7 +424,7 @@ def batch_norm(input,
                act=None,
                is_test=False,
                momentum=0.9,
-               epsilon=1e05,
+               epsilon=1e-05,
                param_attr=None,
                bias_attr=None,
                data_layout='NCHW',
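This one-character fix matters: in Python, 1e05 is one hundred thousand, not a small smoothing term, so the old default would both wash out the variance and trip the new epsilon <= 0.001 check added in batch_norm_op.cc.

>>> 1e05
100000.0
>>> 1e-05
1e-05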
@@ -438,27 +442,29 @@ def batch_norm(input,
     else:
         raise ValueError("unsupported data layout:" + data_layout)
 
-    def create_persistable_var(dtype, shape, initializer=None):
-        name = unique_name(".".join([helper.name, "xxxx"]))
-        var = init_program.global_block().create_var(
-            dtype=dtype, shape=shape, name=name, persistable=True)
-        if initializer is not None:
-            initializer(var, var.block)
-        return program.global_block().create_var(
-            name=name, dtype=dtype, shape=shape, persistable=True)
-
     param_shape = [channel_num]
 
     # create parameter
     scale = helper.create_parameter(
-        attr=helper.param_attr, shape=param_shape, dtype=dtype)
+        attr=helper.param_attr,
+        shape=param_shape,
+        dtype=dtype,
+        initializer=ConstantInitializer(1.0))
     bias = helper.create_parameter(
-        attr=helper.param_attr, shape=param_shape, dtype=dtype)
-
-    # create input
-    mean = create_persistable_var(dtype, param_shape, ConstantInitializer(0.0))
-    variance = create_persistable_var(dtype, param_shape,
-                                      ConstantInitializer(1.0))
+        attr=helper.param_attr,
+        shape=param_shape,
+        dtype=dtype,
+        initializer=ConstantInitializer(0.0))
+
+    mean = helper.create_global_variable(
+        dtype=input.data_type, shape=param_shape, persistable=True)
+    helper.set_variable_initializer(
+        var=mean, initializer=ConstantInitializer(0.0))
+
+    variance = helper.create_global_variable(
+        dtype=input.data_type, shape=param_shape, persistable=True)
+    helper.set_variable_initializer(
+        var=variance, initializer=ConstantInitializer(1.0))
 
     # create output
     # mean and mean_out share the same memory
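mean and variance become persistable global variables because they are running statistics: each step folds the batch statistics into the same buffers, which is also why batch_norm_op.cc enforces that Mean/MeanOut and Variance/VarianceOut share memory. A rough NumPy sketch of that kind of update, under the usual moving-average convention (an illustration only, not the kernel code):

import numpy as np

# Persistable running statistics, matching the ConstantInitializer values above.
running_mean = np.zeros(64, dtype='float32')
running_variance = np.ones(64, dtype='float32')

# One training step folds the batch statistics into the running ones.
momentum = 0.9
batch = np.random.randn(128, 64).astype('float32')
running_mean = momentum * running_mean + (1 - momentum) * batch.mean(axis=0)
running_variance = momentum * running_variance + (1 - momentum) * batch.var(axis=0)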

python/paddle/v2/framework/tests/test_image_classification_train.py

Lines changed: 23 additions & 34 deletions
@@ -1,13 +1,12 @@
+import numpy as np
 import paddle.v2 as paddle
+import paddle.v2.framework.core as core
 import paddle.v2.framework.layers as layers
 import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
 import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program, g_program
 from paddle.v2.framework.executor import Executor
-
-import numpy as np
+from paddle.v2.framework.framework import g_init_program, g_program
+from paddle.v2.framework.initializer import XavierInitializer
 
 
 def resnet_cifar10(input, depth=32, program=None, init_program=None):
@@ -124,7 +123,7 @@ def layer_warp(block_func, input, ch_in, ch_out, count, stride, program,
     return pool
 
 
-def vgg16_bn_drop(input, program, init_program):
+def vgg16_bn_drop(input, program=None, init_program=None):
     def conv_block(input,
                    num_filter,
                    groups,
@@ -155,6 +154,7 @@ def conv_block(input,
     fc1 = layers.fc(input=drop,
                     size=512,
                     act=None,
+                    param_attr={"initializer": XavierInitializer()},
                     program=program,
                     init_program=init_program)
     reshape1 = layers.reshape(
@@ -169,46 +169,34 @@ def conv_block(input,
     fc2 = layers.fc(input=drop2,
                     size=512,
                     act=None,
+                    param_attr={"initializer": XavierInitializer()},
                     program=program,
                     init_program=init_program)
     return fc2
 
 
-init_program = Program()
-program = Program()
-
 classdim = 10
 data_shape = [3, 32, 32]
 
-images = layers.data(
-    name='pixel', shape=data_shape, data_type='float32', program=program)
-
-label = layers.data(
-    name='label',
-    shape=[1],
-    data_type='int64',
-    program=program,
-    init_program=init_program)
+images = layers.data(name='pixel', shape=data_shape, data_type='float32')
+label = layers.data(name='label', shape=[1], data_type='int64')
 
 # Add neural network config
 # option 1. resnet
-net = resnet_cifar10(images, 32, program, init_program)
+# net = resnet_cifar10(images, 32)
 # option 2. vgg
-# net = vgg16_bn_drop(images, program, init_program)
+net = vgg16_bn_drop(images)
 
 # print(program)
 
-predict = layers.fc(input=net,
-                    size=classdim,
-                    act='softmax',
-                    program=program,
-                    init_program=init_program)
-cost = layers.cross_entropy(
-    input=predict, label=label, program=program, init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+predict = layers.fc(input=net, size=classdim, act='softmax')
+cost = layers.cross_entropy(input=predict, label=label)
+avg_cost = layers.mean(x=cost)
+accuracy = layers.accuracy(input=predict, label=label)
 
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost, init_program)
+# optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+optimizer = optimizer.AdamOptimizer(learning_rate=0.001)
+opts = optimizer.minimize(avg_cost)
 
 BATCH_SIZE = 128
 PASS_NUM = 1
@@ -221,7 +209,7 @@ def conv_block(input,
 place = core.CPUPlace()
 exe = Executor(place)
 
-exe.run(init_program, feed={}, fetch_list=[])
+exe.run(g_init_program, feed={}, fetch_list=[])
 
 for pass_id in range(PASS_NUM):
     batch_id = 0
@@ -239,14 +227,15 @@ def conv_block(input,
         tensor_img.set(img_data, place)
         tensor_y.set(y_data, place)
 
-        outs = exe.run(program,
+        outs = exe.run(g_program,
                        feed={"pixel": tensor_img,
                              "label": tensor_y},
-                       fetch_list=[avg_cost])
+                       fetch_list=[avg_cost, accuracy])
 
         loss = np.array(outs[0])
+        acc = np.array(outs[1])
         print("pass_id:" + str(pass_id) + " batch_id:" + str(batch_id) +
-              " loss:" + str(loss))
+              " loss:" + str(loss) + " acc:" + str(acc))
         batch_id = batch_id + 1
 
         if batch_id > 1:
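Besides the loss, the test now fetches the new accuracy layer each step. layers.accuracy wraps the accuracy operator, which here amounts to the fraction of samples in the batch whose highest-scoring class matches the label (a NumPy sketch of that computation, assuming plain top-1 accuracy):

import numpy as np

def top1_accuracy(probs, labels):
    # probs: (batch, classdim) softmax output; labels: (batch, 1) int64
    pred = np.argmax(probs, axis=1)
    return float(np.mean(pred == labels.reshape(-1)))

probs = np.array([[0.1, 0.7, 0.2], [0.8, 0.1, 0.1]])
labels = np.array([[1], [2]])
print(top1_accuracy(probs, labels))  # 0.5: first sample correct, second wrong

The test also switches the fc parameters to XavierInitializer and the optimizer to Adam, and it still stops after two batches, so it remains a smoke test rather than a convergence check.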

python/paddle/v2/framework/tests/test_recognize_digits_mlp.py

Lines changed: 4 additions & 2 deletions
@@ -57,6 +57,8 @@
 cost = layers.cross_entropy(
     input=predict, label=label, program=program, init_program=init_program)
 avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+accuracy = layers.accuracy(
+    input=predict, label=label, program=program, init_program=init_program)
 
 optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
 opts = optimizer.minimize(avg_cost, init_program)
@@ -87,9 +89,9 @@
         outs = exe.run(program,
                        feed={'x': tensor_x,
                              'y': tensor_y},
-                       fetch_list=[avg_cost])
+                       fetch_list=[avg_cost, accuracy])
         out = np.array(outs[0])
-
+        acc = np.array(outs[1])
         if out[0] < 5.0:
             exit(0)  # if avg cost less than 5.0, we think our code is good.
 exit(1)
