Skip to content

Commit 9795e01

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents 2daa05c + 2c5a6ac commit 9795e01

34 files changed

+880
-297
lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
33
project(paddle CXX C)
44
set(PADDLE_MAJOR_VERSION 0)
55
set(PADDLE_MINOR_VERSION 8)
6-
set(PADDLE_PATCH_VERSION 0b0)
6+
set(PADDLE_PATCH_VERSION 0b1)
77
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
88

99
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
@@ -15,7 +15,7 @@ find_package(Protobuf REQUIRED)
1515
find_package(PythonLibs 2.7 REQUIRED)
1616
find_package(PythonInterp 2.7 REQUIRED)
1717
find_package(ZLIB REQUIRED)
18-
find_package(NumPy)
18+
find_package(NumPy REQUIRED)
1919
find_package(Threads REQUIRED)
2020
find_package(Glog)
2121
find_package(Gflags QUIET)

cmake/util.cmake

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,9 @@ function(link_paddle_exe TARGET_NAME)
104104
${PROTOBUF_LIBRARY}
105105
${CMAKE_THREAD_LIBS_INIT}
106106
${CBLAS_LIBS}
107-
${INTERAL_LIBS}
108107
${ZLIB_LIBRARIES}
109-
${CMAKE_DL_LIBS}
110-
)
108+
${INTERAL_LIBS}
109+
${CMAKE_DL_LIBS})
111110

112111
if(WITH_PYTHON)
113112
target_link_libraries(${TARGET_NAME}

demo/seqToseq/seqToseq_net.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -128,12 +128,16 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
128128
return out
129129

130130
decoder_group_name = "decoder_group"
131+
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
132+
StaticInput(input=encoded_proj,is_seq=True)]
133+
131134
if not is_generating:
132135
trg_embedding = embedding_layer(
133136
input=data_layer(name='target_language_word',
134137
size=target_dict_dim),
135138
size=word_vector_dim,
136139
param_attr=ParamAttr(name='_target_language_embedding'))
140+
group_inputs.append(trg_embedding)
137141

138142
# For decoder equipped with attention mechanism, in training,
139143
# target embeding (the groudtruth) is the data input,
@@ -142,22 +146,13 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
142146
# for the recurrent_group.
143147
decoder = recurrent_group(name=decoder_group_name,
144148
step=gru_decoder_with_attention,
145-
input=[
146-
StaticInput(input=encoded_vector,
147-
is_seq=True),
148-
StaticInput(input=encoded_proj,
149-
is_seq=True), trg_embedding
150-
])
149+
input=group_inputs)
151150

152151
lbl = data_layer(name='target_language_next_word',
153152
size=target_dict_dim)
154-
cost = classification_cost(input=decoder, label=lbl, )
153+
cost = classification_cost(input=decoder, label=lbl)
155154
outputs(cost)
156155
else:
157-
gen_inputs = [StaticInput(input=encoded_vector,
158-
is_seq=True),
159-
StaticInput(input=encoded_proj,
160-
is_seq=True), ]
161156
# In generation, the decoder predicts a next target word based on
162157
# the encoded source sequence and the last generated target word.
163158

@@ -171,10 +166,11 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
171166
size=target_dict_dim,
172167
embedding_name='_target_language_embedding',
173168
embedding_size=word_vector_dim)
174-
gen_inputs.append(trg_embedding)
169+
group_inputs.append(trg_embedding)
170+
175171
beam_gen = beam_search(name=decoder_group_name,
176172
step=gru_decoder_with_attention,
177-
input=gen_inputs,
173+
input=group_inputs,
178174
id_input=data_layer(name="sent_id",
179175
size=1),
180176
dict_file=trg_dict_path,

doc/build/contribute_to_paddle.md

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,12 @@ repo or just head straight to the command line:
2525

2626
```shell
2727
# Clone your fork to your local machine
28-
git clone [email protected]:USERNAME/Paddle.git
28+
git clone https://github.com/USERNAME/Paddle.git
29+
```
30+
Then you can start to develop by making a local development branch
31+
```shell
32+
git checkout -b MY_COOL_STUFF_BRANCH origin/master
2933
```
30-
Then you can start to develop.
3134

3235
## Commit
3336

@@ -45,7 +48,7 @@ are the details if any.
4548

4649
## Keeping Fork Up to Date
4750

48-
Before pull your request, you shold sync you code from the latest PaddlePaddle.
51+
Before pull your request, you should sync your code from the latest PaddlePaddle.
4952
To do this, you'll need to add a remote at first:
5053

5154
```shell
@@ -60,8 +63,7 @@ git remote -v
6063
Update your fork with the latest upstream changes:
6164

6265
```shell
63-
git fetch upstream
64-
git pull upstream master
66+
git pull --rebase upstream HEAD
6567
```
6668

6769
If there are no unique commits locally, git will simply perform a fast-forward.
@@ -74,10 +76,26 @@ Now, your local master branch is up-to-date with everything modified upstream.
7476

7577
```shell
7678
# push to your repository in Github
77-
git push origin master
79+
git push origin HEAD
7880
```
7981

8082
## Pull Request
8183

8284
Go to the page for your fork on GitHub, select your development branch,
8385
and click the **pull request button**.
86+
87+
## Update your pull request with the latest version
88+
89+
During the code review, your pull request may become stale because new commits in
90+
baidu/Paddle. GitHub allows automatic update if there is no conflict. You can do this
91+
by clicking the "Update Branch" button in your pull request page. However, in the case
92+
of conflict, you need to do the update manually. You need to do the following on
93+
your local repository:
94+
```shell
95+
git checkout MY_COOL_STUFF_BRANCH
96+
git pull --rebase upstream HEAD
97+
# You may need to resolve the conflict according to the git prompt.
98+
# Make and test your code.
99+
git push -f origin HEAD
100+
```
101+
Now your Pull Request is updated with the latest version.

doc/ui/api/trainer_config_helpers/activations.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@ AbsActivation
1212
:members: AbsActivation
1313
:noindex:
1414

15+
ExpActivation
16+
===============
17+
18+
.. automodule:: paddle.trainer_config_helpers.activations
19+
:members: ExpActivation
20+
:noindex:
21+
1522
IdentityActivation
1623
==================
1724

doc/ui/data_provider/pydataprovider2.rst

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ A small part of the original data as an example is shown as below:
2424

2525
.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_train.txt
2626

27-
Each line of the data contains two parts, separated by ';'. The first part is
27+
Each line of the data contains two parts, separated by :code:`;`. The first part is
2828
label of an image. The second part contains 28x28 pixel float values.
2929

3030
Just write path of the above data into train.list. It looks like this:
@@ -74,7 +74,20 @@ you can take this as an example.
7474

7575
.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_config.py
7676

77-
Here we specify training data by 'train.list', and no testing data is specified.
77+
Here we specify training data by :code:`train.list`, and no testing data is specified.
78+
The method which actually provides data is :code:`process`.
79+
80+
User also can use another style to provide data, which defines the
81+
:code:`data_layer`'s name explicitly when `yield`. For example,
82+
the :code:`dataprovider` is shown as below.
83+
84+
.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.dict.py
85+
:linenos:
86+
87+
If the user didn't give the :code:`data_layer`'s name, PaddlePaddle will use
88+
the order of :code:`data_layer` definition roughly to determine which feature to
89+
which :code:`data_layer`. This order may not be correct, so TO DEFINE THE
90+
:code:`data_layer`'s NAMES EXPLICITLY IS THE RECOMMENDED WAY TO PROVIDE DATA.
7891

7992
Now, this simple example of using PyDataProvider is finished.
8093
The only thing that the user should know is how to generate **one sample** from
@@ -93,7 +106,7 @@ DataProvider for the sequential model
93106
-------------------------------------
94107
A sequence model takes sequences as its input. A sequence is made up of several
95108
timesteps. The so-called timestep, is not necessary to have something to do
96-
with 'time'. It can also be explained to that the order of data are taken into
109+
with time. It can also be explained to that the order of data are taken into
97110
consideration into model design and training.
98111
For example, the sentence can be interpreted as a kind of sequence data in NLP
99112
tasks.
@@ -155,23 +168,7 @@ Reference
155168
@provider
156169
+++++++++
157170

158-
'@provider' is a Python `Decorator`_, it can construct a PyDataProvider in
159-
PaddlePaddle from a user defined function. Its parameters are:
160-
161-
* `input_types`_ defines format of the data input.
162-
* should_shuffle defines whether to shuffle data or not. By default, it is set
163-
true during training, and false during testing.
164-
* pool_size is the memory pool size (in sample number) in DataProvider.
165-
-1 means no limit.
166-
* can_over_batch_size defines whether PaddlePaddle can store little more
167-
samples than pool_size. It is better to set True to avoid some deadlocks.
168-
* calc_batch_size is a function define how to calculate batch size. This is
169-
usefull in sequential model, that defines batch size is counted upon sequence
170-
or token. By default, each sample or sequence counts to 1 when calculating
171-
batch size.
172-
* cache is a data cache strategy, see `cache`_.
173-
* Init_hook function is invoked once the data provider is initialized,
174-
see `init_hook`_.
171+
.. autofunction:: paddle.trainer.PyDataProvider2.provider
175172

176173
input_types
177174
+++++++++++

doc_cn/ui/data_provider/mnist_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@
44
test_list=None,
55
module='mnist_provider',
66
obj='process')
7+
img = data_layer(name='pixel', size=784)
8+
label = data_layer(name='label', size=10)
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from paddle.trainer.PyDataProvider2 import *
2+
3+
4+
# Define a py data provider
5+
@provider(input_types=[
6+
dense_vector(28 * 28),
7+
integer_value(10)
8+
])
9+
def process(settings, filename): # settings is not used currently.
10+
f = open(filename, 'r') # open one of training file
11+
12+
for line in f: # read each line
13+
label, pixel = line.split(';')
14+
15+
# get features and label
16+
pixels_str = pixel.split(' ')
17+
18+
pixels_float = []
19+
for each_pixel_str in pixels_str:
20+
pixels_float.append(float(each_pixel_str))
21+
22+
# give data to paddle.
23+
yield { "pixel": pixels_float, 'label': int(label) }
24+
25+
f.close() # close file

doc_cn/ui/data_provider/pydataprovider2.rst

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,14 @@ process函数调用多次 :code:`yield` 即可。 :code:`yield` 是Python的一
5656
这里说明了训练数据是 'train.list',而没有测试数据。引用的DataProvider是 'mnist_provider'
5757
这个模块中的 'process' 函数。
5858

59+
同时,根据模型配置文件中 :code:`data_layer` 的名字,用户也可以显式指定返回的数据对应关系。例如:
60+
61+
.. literalinclude:: mnist_provider.dict.py
62+
:linenos:
63+
64+
如果用户不指定返回数据的对应关系,那么PaddlePaddle会粗略的根据layer的声明顺序,
65+
来确定对应关系。这个对应关系可能不正确。所以推荐使用显式指定返回值和数据对应关系。
66+
5967
至此,简单的PyDataProvider样例就说明完毕了。对于用户来说,将数据发送给PaddlePaddle,仅仅需要
6068
知道如何从 **一个文件** 里面读取 **一条** 样本。而PaddlePaddle进程帮助用户做了
6169

@@ -119,18 +127,25 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数:
119127
@provider
120128
+++++++++
121129

122-
'@provider'是一个Python的 `Decorator`_ ,他可以将某一个函数标记成一个PyDataProvider。它包含的参数有:
130+
:code:`@provider` 是一个Python的 `Decorator`_ ,他可以将某一个函数标记成一个PyDataProvider。它包含的参数有:
123131

124132
* `input_types`_ 是数据输入格式。具体有哪些格式,参考 `input_types`_ 。
125133
* should_shuffle 是个DataProvider是不是要做shuffle,如果不设置的话,训练的时候默认shuffle,
126-
测试的时候默认不shuffle
134+
测试的时候默认不shuffle。
135+
* min_pool_size 是设置DataProvider在内存中最小暂存的数据条数。这个也是PaddlePaddle所能够保证的shuffle粒度。
136+
设置成-1的话,会预先读取全部数据到内存中。
127137
* pool_size 是设置DataProvider在内存中暂存的数据条数。设置成-1的话,即不在乎内存暂存多少条数据。
128138
* can_over_batch_size 表示是否允许Paddle暂存略微多余pool_size的数据。这样做可以避免很多死锁问题。
129139
一般推荐设置成True
130140
* calc_batch_size 传入的是一个函数,这个函数以一条数据为参数,返回batch_size的大小。默认情况下一条数据
131141
是一个batch size,但是有时为了计算均衡性,可以将一条数据设置成多个batch size
132142
* cache 是数据缓存的策略,参考 `cache`_
133143
* init_hook 是初始化时调用的函数,参考 `init_hook`_
144+
* use_dynamic_order 如果是true的话,可以返回一个dict,key是data_layer的名字,value是特征值。同时,也可以
145+
返回一个list或者tuple。如果是false的话,只能够返回list或者tuple
146+
* check 设置成true的话,会根据input_types检查数据的合法性。
147+
* check_fail_continue 如果设置成true的话,即使在check中数据不合法,也会扔掉这条数据,继续训练。 如果
148+
check是false的话,没有作用。
134149

135150
input_types
136151
+++++++++++
@@ -190,3 +205,55 @@ DataProvider提供了两种简单的Cache策略。他们是
190205
* CacheType.NO_CACHE 不缓存任何数据,每次都会从python端读取数据
191206
* CacheType.CACHE_PASS_IN_MEM 第一个pass会从python端读取数据,剩下的pass会直接从内存里
192207
读取数据。
208+
209+
210+
注意事项
211+
--------
212+
213+
可能的内存泄露问题
214+
++++++++++++++++++
215+
216+
PaddlePaddle将train.list中的每一行,都传递给process函数,从而生成多个generator。
217+
即如果train.list中,有100个训练文件,即会生成100个generator。这个本身不是一个很
218+
严重的问题。
219+
220+
但是,如果在训练时,每一条训练数据都是一个文件,并且,训练数据非常多的情况下,就
221+
会生成多个generator。每个generator在没有调用的时候,是几乎不占内存的。但是,当调
222+
用过一次的时候,generator便会存下当前的上下文(Context)。而这个Context可能会非常
223+
大。并且,generator至少调用两次才会知道是否停止。所以,即使在process里面只会有一
224+
个yield,也需要两次随机选择到同样的generator的时候,才会释放该段内存。
225+
226+
.. code-block:: python
227+
228+
def func():
229+
yield 0
230+
231+
f = func() # 创建generator
232+
tmp = next(f) # 调用一次,返回0
233+
tmp = next(f) # 调用第二次的时候,才会Stop Iteration
234+
235+
而如果按顺序调用这些generator就不会出现这个问题。
236+
237+
所以最佳实践推荐不要将每一个样本都放入train.list。而是将样本的地址放入另一个文本
238+
文件,train.list写入那个文本文件的地址。 或者在python generator的上下文中尽量留
239+
下非常少的变量引用。例如
240+
241+
.. code-block:: python
242+
243+
def real_process(fn):
244+
# ... read from fn
245+
return result # 当函数返回的时候,python可以解除掉内部变量的引用。
246+
247+
def process(fn):
248+
yield real_process(fn)
249+
250+
这个问题是PyDataProvider读数据时候的逻辑问题,基本上不能整体修正。
251+
252+
253+
内存不够用的情况
254+
++++++++++++++++
255+
256+
PyDataProvider2会尽量使用内存。所以如果对于内存比较小的机器,推荐设置
257+
:code:`pool_size` 变量,而这个变量推荐大于训练的batch size,并且在内存足够
258+
的情况下越大越好。
259+

paddle/gserver/dataproviders/DataProvider.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,13 @@ void DoubleBuffer::startAsyncLoad() {
149149
taskReadySem_.post();
150150
}
151151

152-
ClassRegistrar<DataProvider, DataConfig, bool> DataProvider::registrar_;
153-
DataProvider* DataProvider::create(const DataConfig& config, bool useGpu) {
154-
return registrar_.createByType(config.type(), config, useGpu);
152+
ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
153+
DataProvider::registrar_;
154+
155+
DataProvider* DataProvider::create(const DataConfig& config,
156+
const ModelConfig& modelConfig,
157+
bool useGpu) {
158+
return registrar_.createByType(config.type(), config, modelConfig, useGpu);
155159
}
156160

157161
REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);

0 commit comments

Comments
 (0)