Skip to content

Commit 2ac5772

Browse files
committed
Merge branch 'develop' of github.com:baidu/Paddle into feature/fill_constant_force_cpu
2 parents 0ede2a7 + c365c61 commit 2ac5772

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

43 files changed

+981
-241
lines changed

benchmark/paddle/image/resnet.py

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
#!/usr/bin/env python
from paddle.trainer_config_helpers import *

# Input geometry and class count for ImageNet-style classification.
height = 224
width = 224
num_class = 1000

# Runtime-tunable knobs, overridable via --config_args.
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50)
is_test = get_config_arg("is_test", bool, False)

# Route the training list through the python data provider.
args = dict(height=height, width=width, color=True, num_class=num_class)
define_py_data_sources2(
    "train.list", None, module="provider", obj="process", args=args)

settings(
    batch_size=batch_size,
    learning_rate=0.01 / batch_size,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))
20+
21+
22+
#######################Network Configuration #############
def conv_bn_layer(name,
                  input,
                  filter_size,
                  num_filters,
                  stride,
                  padding,
                  channels=None,
                  active_type=ReluActivation()):
    """Convolution followed by batch normalization.

    The convolution itself is linear and bias-free; the activation
    (``active_type``) is applied by the batch-norm layer instead.
    """
    conv = img_conv_layer(
        name=name + "_conv",
        input=input,
        filter_size=filter_size,
        num_channels=channels,
        num_filters=num_filters,
        stride=stride,
        padding=padding,
        act=LinearActivation(),
        bias_attr=False)
    return batch_norm_layer(
        name=name + "_bn",
        input=conv,
        act=active_type,
        use_global_stats=is_test)
49+
50+
51+
def bottleneck_block(name, input, num_filters1, num_filters2):
    """Identity-shortcut bottleneck block (1x1 -> 3x3 -> 1x1) of ResNet.

    The last conv_bn_layer is linear; ReLU is applied only after the
    elementwise addition with the identity shortcut.
    """
    branch2 = conv_bn_layer(
        name=name + '_branch2a',
        input=input,
        filter_size=1,
        num_filters=num_filters1,
        stride=1,
        padding=0)
    branch2 = conv_bn_layer(
        name=name + '_branch2b',
        input=branch2,
        filter_size=3,
        num_filters=num_filters1,
        stride=1,
        padding=1)
    branch2 = conv_bn_layer(
        name=name + '_branch2c',
        input=branch2,
        filter_size=1,
        num_filters=num_filters2,
        stride=1,
        padding=0,
        active_type=LinearActivation())

    return addto_layer(
        name=name + "_addto", input=[input, branch2], act=ReluActivation())
82+
83+
84+
def mid_projection(name, input, num_filters1, num_filters2, stride=2):
    """Projection block of ResNet, used where dimensions change.

    branch1 is a linear 1x1 projection shortcut that matches the new
    dimensions; branch2 is the usual 1x1 -> 3x3 -> 1x1 bottleneck with
    the stride applied in its first convolution. ReLU is applied after
    the elementwise addition of the two branches.
    """
    # Projection shortcut: linear 1x1 conv carrying the block's stride.
    branch1 = conv_bn_layer(
        name=name + '_branch1',
        input=input,
        filter_size=1,
        num_filters=num_filters2,
        stride=stride,
        padding=0,
        active_type=LinearActivation())

    branch2 = conv_bn_layer(
        name=name + '_branch2a',
        input=input,
        filter_size=1,
        num_filters=num_filters1,
        stride=stride,
        padding=0)
    branch2 = conv_bn_layer(
        name=name + '_branch2b',
        input=branch2,
        filter_size=3,
        num_filters=num_filters1,
        stride=1,
        padding=1)
    branch2 = conv_bn_layer(
        name=name + '_branch2c',
        input=branch2,
        filter_size=1,
        num_filters=num_filters2,
        stride=1,
        padding=0,
        active_type=LinearActivation())

    return addto_layer(
        name=name + "_addto", input=[branch1, branch2], act=ReluActivation())
129+
130+
131+
# Network input: flattened 3-channel image of `height` x `width`.
img = data_layer(name='image', size=height * width * 3)
132+
133+
134+
def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
    """Build a deep ResNet (50/101/152 layers) over the global `img` input.

    res2_num .. res5_num give the number of bottleneck blocks stacked in
    conv2_x .. conv5_x respectively. Returns the softmax output layer.
    """
    # conv1 + pool1: 224x224 -> 112x112 -> 56x56 (for ImageNet).
    net = conv_bn_layer(
        "conv1",
        input=img,
        filter_size=7,
        channels=3,
        num_filters=64,
        stride=2,
        padding=3)
    net = img_pool_layer(name="pool1", input=net, pool_size=3, stride=2)

    # conv2_x: 56x56 — the first projection keeps spatial size (stride 1).
    net = mid_projection(
        name="res2_1", input=net, num_filters1=64, num_filters2=256, stride=1)
    for i in xrange(2, res2_num + 1):
        net = bottleneck_block(
            name="res2_%d" % i, input=net, num_filters1=64, num_filters2=256)

    # conv3_x: 28x28
    net = mid_projection(
        name="res3_1", input=net, num_filters1=128, num_filters2=512)
    for i in xrange(2, res3_num + 1):
        net = bottleneck_block(
            name="res3_%d" % i,
            input=net,
            num_filters1=128,
            num_filters2=512)

    # conv4_x: 14x14
    net = mid_projection(
        name="res4_1", input=net, num_filters1=256, num_filters2=1024)
    for i in xrange(2, res4_num + 1):
        net = bottleneck_block(
            name="res4_%d" % i,
            input=net,
            num_filters1=256,
            num_filters2=1024)

    # conv5_x: 7x7
    net = mid_projection(
        name="res5_1", input=net, num_filters1=512, num_filters2=2048)
    for i in xrange(2, res5_num + 1):
        net = bottleneck_block(
            name="res5_%d" % i,
            input=net,
            num_filters1=512,
            num_filters2=2048)

    # Global average pooling followed by the softmax classifier.
    net = img_pool_layer(
        name='avgpool',
        input=net,
        pool_size=7,
        stride=1,
        pool_type=AvgPooling())
    return fc_layer(input=net, size=num_class, act=SoftmaxActivation())
199+
200+
201+
# Stage block counts for the supported ResNet depths.
depth_cfg = {
    50: (3, 4, 6, 3),
    101: (3, 4, 23, 3),
    152: (3, 8, 36, 3),
}
if layer_num not in depth_cfg:
    # Fail fast with a clear message. The previous code only printed
    # "Wrong layer number." and then crashed later with a NameError on
    # the unbound `resnet` when building the loss.
    raise ValueError(
        "Wrong layer number: %s (expected 50, 101 or 152)" % layer_num)
resnet = deep_res_net(*depth_cfg[layer_num])

# Classification target and cross-entropy training loss.
lbl = data_layer(name="label", size=num_class)
loss = cross_entropy(name='loss', input=resnet, label=lbl)
inputs(img, lbl)
outputs(loss)

benchmark/paddle/image/run_mkldnn.sh

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,23 @@ function train() {
55
export OMP_DYNAMIC="FALSE"
66
export KMP_AFFINITY="granularity=fine,compact,0,0"
77
topology=$1
8-
bs=$2
9-
use_mkldnn=$3
10-
if [ $3 == "True" ]; then
8+
layer_num=$2
9+
bs=$3
10+
use_mkldnn=$4
11+
if [ $4 == "True" ]; then
1112
thread=1
12-
log="logs/${topology}-mkldnn-${bs}.log"
13-
elif [ $3 == "False" ]; then
13+
log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
14+
elif [ $4 == "False" ]; then
1415
thread=`nproc`
1516
# each trainer_count use only 1 core to avoid conflict
1617
export OMP_NUM_THREADS=1
1718
export MKL_NUM_THREADS=1
18-
log="logs/${topology}-${thread}mklml-${bs}.log"
19+
log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
1920
else
2021
echo "Wrong input $3, use True or False."
2122
exit 0
2223
fi
23-
args="batch_size=${bs}"
24+
args="batch_size=${bs},layer_num=${layer_num}"
2425
config="${topology}.py"
2526
paddle train --job=time \
2627
--config=$config \
@@ -40,12 +41,9 @@ if [ ! -d "logs" ]; then
4041
mkdir logs
4142
fi
4243

43-
#========== mkldnn ==========#
44-
train vgg 64 True
45-
train vgg 128 True
46-
train vgg 256 True
47-
48-
#========== mklml ===========#
49-
train vgg 64 False
50-
train vgg 128 False
51-
train vgg 256 False
44+
for use_mkldnn in True False; do
45+
for batchsize in 64 128 256; do
46+
train vgg 19 $batchsize $use_mkldnn
47+
train resnet 50 $batchsize $use_mkldnn
48+
done
49+
done

doc/design/float16.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,6 @@ After float16 class is available, some of the future items are below:
5555

5656
- Update pybind/tensor_py.h to bind c++ float16 with numpy float16.
5757

58-
- Modify `IndicateDataType()` method in `framework/operator.h` to make it compatible with float16.
58+
- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
5959

6060
- Create a type-casting operator that can convert the data type in tensor between float16 and other types.

paddle/framework/ddim.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ int64_t DDim::operator[](int idx) const {
124124
return boost::apply_visitor(DynamicConstIndexer(idx), var);
125125
}
126126

127-
int64_t DDim::size() const { return arity(*this); }
127+
int DDim::size() const { return arity(*this); }
128128

129129
bool DDim::operator==(DDim d) const {
130130
if (var.which() != d.getVar().which()) {

paddle/framework/ddim.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ struct DDim {
7171

7272
DDim operator*(DDim d) const;
7373

74-
int64_t size() const;
74+
int size() const;
7575
};
7676

7777
/**

paddle/framework/lod_rank_table.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
3131
TableItem item;
3232
item.index = i;
3333
item.length = vec[i + 1] - vec[i];
34+
VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
3435
items_.emplace_back(item);
3536
}
3637
// NOTE(yuyang18):

paddle/framework/lod_tensor.cc

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,20 @@
2727
namespace paddle {
2828
namespace framework {
2929

30+
std::ostream& operator<<(std::ostream& os, const LoD& lod) {
31+
os << "{";
32+
for (auto& v : lod) {
33+
os << "{";
34+
for (auto& i : v) {
35+
os << i << ",";
36+
}
37+
os << "}";
38+
}
39+
os << "}";
40+
41+
return os;
42+
}
43+
3044
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
3145
LoD new_lod;
3246
new_lod.reserve(level_end - level_begin);
@@ -136,37 +150,35 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
136150
ShareDataWith(Slice(begin, end));
137151
}
138152

139-
void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
140-
std::vector<std::vector<size_t>>* lod_length,
141-
size_t* start_offset) {
142-
lod_length->clear();
143-
PADDLE_ENFORCE(start_idx < lod.size() - 1,
144-
"start_idx should be >= 0 and < lod.size() - 1.");
145-
PADDLE_ENFORCE(end_idx < lod.size(),
146-
"end_idx should be >= 0 and < lod.size().");
147-
PADDLE_ENFORCE_LE(start_idx, end_idx,
148-
"start_idx should be less than end_idx.");
149-
for (size_t level_idx = 0; level_idx < lod.size(); ++level_idx) {
153+
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
154+
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
155+
size_t end_idx, size_t start_level) {
156+
LoD sub_lod;
157+
158+
for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
159+
PADDLE_ENFORCE_LE(start_idx, end_idx);
160+
PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
150161
std::vector<size_t> level_lens;
151162
for (size_t i = start_idx; i < end_idx; ++i) {
152163
level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
153164
}
154-
lod_length->emplace_back(level_lens);
165+
sub_lod.emplace_back(level_lens);
155166
start_idx = lod[level_idx][start_idx];
156167
end_idx = lod[level_idx][end_idx];
157168
}
158-
*start_offset = start_idx;
169+
170+
return LoDAndOffset{sub_lod, {start_idx, end_idx}};
159171
}
160172

161-
void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length) {
162-
PADDLE_ENFORCE_EQ(
163-
lod->size(), lod_length.size(),
173+
void AppendLoD(LoD* lod, const LoD& lod_length) {
174+
PADDLE_ENFORCE(
175+
lod->empty() || lod->size() == lod_length.size(),
164176
"The lod_length should has the same size with the appended lod.");
177+
if (lod->empty()) {
178+
*lod = LoD(lod_length.size(), std::vector<size_t>({0}));
179+
}
165180
for (size_t i = 0; i < lod->size(); ++i) {
166181
auto& level = (*lod)[i];
167-
if (level.empty()) {
168-
level.push_back(0);
169-
}
170182
for (size_t len : lod_length[i]) {
171183
level.push_back(level.back() + len);
172184
}

paddle/framework/lod_tensor.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ using Vector = thrust::host_vector<
5656
*/
5757
using LoD = std::vector<Vector<size_t>>;
5858

59+
std::ostream& operator<<(std::ostream& os, const LoD& lod);
60+
5961
/*
6062
* Slice levels from a LoD.
6163
* NOTE the lowest level should always be the absolute offsets of the underlying
@@ -181,11 +183,10 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
181183
return tensor;
182184
}
183185

184-
void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
185-
std::vector<std::vector<size_t>>* lod_length,
186-
size_t* start_offset);
186+
std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
187+
const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
187188

188-
void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length);
189+
void AppendLoD(LoD* lod, const LoD& lod_length);
189190

190191
} // namespace framework
191192
} // namespace paddle

0 commit comments

Comments
 (0)