Skip to content

Commit 9795e01

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents 2daa05c + 2c5a6ac commit 9795e01

34 files changed

+880
-297
lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
33
project(paddle CXX C)
44
set(PADDLE_MAJOR_VERSION 0)
55
set(PADDLE_MINOR_VERSION 8)
6-
set(PADDLE_PATCH_VERSION 0b0)
6+
set(PADDLE_PATCH_VERSION 0b1)
77
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
88

99
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
@@ -15,7 +15,7 @@ find_package(Protobuf REQUIRED)
1515
find_package(PythonLibs 2.7 REQUIRED)
1616
find_package(PythonInterp 2.7 REQUIRED)
1717
find_package(ZLIB REQUIRED)
18-
find_package(NumPy)
18+
find_package(NumPy REQUIRED)
1919
find_package(Threads REQUIRED)
2020
find_package(Glog)
2121
find_package(Gflags QUIET)

cmake/util.cmake

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,9 @@ function(link_paddle_exe TARGET_NAME)
104104
${PROTOBUF_LIBRARY}
105105
${CMAKE_THREAD_LIBS_INIT}
106106
${CBLAS_LIBS}
107-
${INTERAL_LIBS}
108107
${ZLIB_LIBRARIES}
109-
${CMAKE_DL_LIBS}
110-
)
108+
${INTERAL_LIBS}
109+
${CMAKE_DL_LIBS})
111110

112111
if(WITH_PYTHON)
113112
target_link_libraries(${TARGET_NAME}

demo/seqToseq/seqToseq_net.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -128,12 +128,16 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
128128
return out
129129

130130
decoder_group_name = "decoder_group"
131+
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
132+
StaticInput(input=encoded_proj,is_seq=True)]
133+
131134
if not is_generating:
132135
trg_embedding = embedding_layer(
133136
input=data_layer(name='target_language_word',
134137
size=target_dict_dim),
135138
size=word_vector_dim,
136139
param_attr=ParamAttr(name='_target_language_embedding'))
140+
group_inputs.append(trg_embedding)
137141

138142
# For decoder equipped with attention mechanism, in training,
139143
# target embeding (the groudtruth) is the data input,
@@ -142,22 +146,13 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
142146
# for the recurrent_group.
143147
decoder = recurrent_group(name=decoder_group_name,
144148
step=gru_decoder_with_attention,
145-
input=[
146-
StaticInput(input=encoded_vector,
147-
is_seq=True),
148-
StaticInput(input=encoded_proj,
149-
is_seq=True), trg_embedding
150-
])
149+
input=group_inputs)
151150

152151
lbl = data_layer(name='target_language_next_word',
153152
size=target_dict_dim)
154-
cost = classification_cost(input=decoder, label=lbl, )
153+
cost = classification_cost(input=decoder, label=lbl)
155154
outputs(cost)
156155
else:
157-
gen_inputs = [StaticInput(input=encoded_vector,
158-
is_seq=True),
159-
StaticInput(input=encoded_proj,
160-
is_seq=True), ]
161156
# In generation, the decoder predicts a next target word based on
162157
# the encoded source sequence and the last generated target word.
163158

@@ -171,10 +166,11 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
171166
size=target_dict_dim,
172167
embedding_name='_target_language_embedding',
173168
embedding_size=word_vector_dim)
174-
gen_inputs.append(trg_embedding)
169+
group_inputs.append(trg_embedding)
170+
175171
beam_gen = beam_search(name=decoder_group_name,
176172
step=gru_decoder_with_attention,
177-
input=gen_inputs,
173+
input=group_inputs,
178174
id_input=data_layer(name="sent_id",
179175
size=1),
180176
dict_file=trg_dict_path,

doc/build/contribute_to_paddle.md

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,12 @@ repo or just head straight to the command line:
2525

2626
```shell
2727
# Clone your fork to your local machine
28-
git clone [email protected]:USERNAME/Paddle.git
28+
git clone https://github.com/USERNAME/Paddle.git
29+
```
30+
Then you can start to develop by making a local development branch
31+
```shell
32+
git checkout -b MY_COOL_STUFF_BRANCH origin/master
2933
```
30-
Then you can start to develop.
3134

3235
## Commit
3336

@@ -45,7 +48,7 @@ are the details if any.
4548

4649
## Keeping Fork Up to Date
4750

48-
Before pull your request, you shold sync you code from the latest PaddlePaddle.
51+
Before pull your request, you should sync your code from the latest PaddlePaddle.
4952
To do this, you'll need to add a remote at first:
5053

5154
```shell
@@ -60,8 +63,7 @@ git remote -v
6063
Update your fork with the latest upstream changes:
6164

6265
```shell
63-
git fetch upstream
64-
git pull upstream master
66+
git pull --rebase upstream HEAD
6567
```
6668

6769
If there are no unique commits locally, git will simply perform a fast-forward.
@@ -74,10 +76,26 @@ Now, your local master branch is up-to-date with everything modified upstream.
7476

7577
```shell
7678
# push to your repository in Github
77-
git push origin master
79+
git push origin HEAD
7880
```
7981

8082
## Pull Request
8183

8284
Go to the page for your fork on GitHub, select your development branch,
8385
and click the **pull request button**.
86+
87+
## Update your pull request with the latest version
88+
89+
During the code review, your pull request may become stale because new commits in
90+
baidu/Paddle. GitHub allows automatic update if there is no conflict. You can do this
91+
by clicking the "Update Branch" button in your pull request page. However, in the case
92+
of conflict, you need to do the update manually. You need to do the following on
93+
your local repository:
94+
```shell
95+
git checkout MY_COOL_STUFF_BRANCH
96+
git pull --rebase upstream HEAD
97+
# You may need to resolve the conflict according to the git prompt.
98+
# Make and test your code.
99+
git push -f origin HEAD
100+
```
101+
Now your Pull Request is updated with the latest version.

doc/ui/api/trainer_config_helpers/activations.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@ AbsActivation
1212
:members: AbsActivation
1313
:noindex:
1414

15+
ExpActivation
16+
===============
17+
18+
.. automodule:: paddle.trainer_config_helpers.activations
19+
:members: ExpActivation
20+
:noindex:
21+
1522
IdentityActivation
1623
==================
1724

doc/ui/data_provider/pydataprovider2.rst

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ A small part of the original data as an example is shown as below:
2424

2525
.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_train.txt
2626

27-
Each line of the data contains two parts, separated by ';'. The first part is
27+
Each line of the data contains two parts, separated by :code:`;`. The first part is
2828
label of an image. The second part contains 28x28 pixel float values.
2929

3030
Just write path of the above data into train.list. It looks like this:
@@ -74,7 +74,20 @@ you can take this as an example.
7474

7575
.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_config.py
7676

77-
Here we specify training data by 'train.list', and no testing data is specified.
77+
Here we specify training data by :code:`train.list`, and no testing data is specified.
78+
The method which actually provides data is :code:`process`.
79+
80+
User also can use another style to provide data, which defines the
81+
:code:`data_layer`'s name explicitly when `yield`. For example,
82+
the :code:`dataprovider` is shown as below.
83+
84+
.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.dict.py
85+
:linenos:
86+
87+
If the user didn't give the :code:`data_layer`'s name, PaddlePaddle will use
88+
the order of :code:`data_layer` definition roughly to determine which feature to
89+
which :code:`data_layer`. This order may not be correct, so TO DEFINE THE
90+
:code:`data_layer`'s NAMES EXPLICITLY IS THE RECOMMENDED WAY TO PROVIDE DATA.
7891

7992
Now, this simple example of using PyDataProvider is finished.
8093
The only thing that the user should know is how to generate **one sample** from
@@ -93,7 +106,7 @@ DataProvider for the sequential model
93106
-------------------------------------
94107
A sequence model takes sequences as its input. A sequence is made up of several
95108
timesteps. The so-called timestep, is not necessary to have something to do
96-
with 'time'. It can also be explained to that the order of data are taken into
109+
with time. It can also be explained to that the order of data are taken into
97110
consideration into model design and training.
98111
For example, the sentence can be interpreted as a kind of sequence data in NLP
99112
tasks.
@@ -155,23 +168,7 @@ Reference
155168
@provider
156169
+++++++++
157170

158-
'@provider' is a Python `Decorator`_, it can construct a PyDataProvider in
159-
PaddlePaddle from a user defined function. Its parameters are:
160-
161-
* `input_types`_ defines format of the data input.
162-
* should_shuffle defines whether to shuffle data or not. By default, it is set
163-
true during training, and false during testing.
164-
* pool_size is the memory pool size (in sample number) in DataProvider.
165-
-1 means no limit.
166-
* can_over_batch_size defines whether PaddlePaddle can store little more
167-
samples than pool_size. It is better to set True to avoid some deadlocks.
168-
* calc_batch_size is a function define how to calculate batch size. This is
169-
usefull in sequential model, that defines batch size is counted upon sequence
170-
or token. By default, each sample or sequence counts to 1 when calculating
171-
batch size.
172-
* cache is a data cache strategy, see `cache`_.
173-
* Init_hook function is invoked once the data provider is initialized,
174-
see `init_hook`_.
171+
.. autofunction:: paddle.trainer.PyDataProvider2.provider
175172

176173
input_types
177174
+++++++++++

doc_cn/ui/data_provider/mnist_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@
44
test_list=None,
55
module='mnist_provider',
66
obj='process')
7+
img = data_layer(name='pixel', size=784)
8+
label = data_layer(name='label', size=10)
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from paddle.trainer.PyDataProvider2 import *
2+
3+
4+
# Define a py data provider
5+
@provider(input_types=[
6+
dense_vector(28 * 28),
7+
integer_value(10)
8+
])
9+
def process(settings, filename): # settings is not used currently.
10+
f = open(filename, 'r') # open one of training file
11+
12+
for line in f: # read each line
13+
label, pixel = line.split(';')
14+
15+
# get features and label
16+
pixels_str = pixel.split(' ')
17+
18+
pixels_float = []
19+
for each_pixel_str in pixels_str:
20+
pixels_float.append(float(each_pixel_str))
21+
22+
# give data to paddle.
23+
yield { "pixel": pixels_float, 'label': int(label) }
24+
25+
f.close() # close file

doc_cn/ui/data_provider/pydataprovider2.rst

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,14 @@ process函数调用多次 :code:`yield` 即可。 :code:`yield` 是Python的一
5656
这里说明了训练数据是 'train.list',而没有测试数据。引用的DataProvider是 'mnist_provider'
5757
这个模块中的 'process' 函数。
5858

59+
同时,根据模型配置文件中 :code:`data_layer` 的名字,用户也可以显式指定返回的数据对应关系。例如:
60+
61+
.. literalinclude:: mnist_provider.dict.py
62+
:linenos:
63+
64+
如果用户不指定返回数据的对应关系,那么PaddlePaddle会粗略的根据layer的声明顺序,
65+
来确定对应关系。这个对应关系可能不正确。所以推荐使用显式指定返回值和数据对应关系。
66+
5967
至此,简单的PyDataProvider样例就说明完毕了。对于用户来说,将数据发送给PaddlePaddle,仅仅需要
6068
知道如何从 **一个文件** 里面读取 **一条** 样本。而PaddlePaddle进程帮助用户做了
6169

@@ -119,18 +127,25 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数:
119127
@provider
120128
+++++++++
121129

122-
'@provider'是一个Python的 `Decorator`_ ,他可以将某一个函数标记成一个PyDataProvider。它包含的参数有:
130+
:code:`@provider` 是一个Python的 `Decorator`_ ,他可以将某一个函数标记成一个PyDataProvider。它包含的参数有:
123131

124132
* `input_types`_ 是数据输入格式。具体有哪些格式,参考 `input_types`_ 。
125133
* should_shuffle 是个DataProvider是不是要做shuffle,如果不设置的话,训练的时候默认shuffle,
126-
测试的时候默认不shuffle
134+
测试的时候默认不shuffle。
135+
* min_pool_size 是设置DataProvider在内存中最小暂存的数据条数。这个也是PaddlePaddle所能够保证的shuffle粒度。
136+
设置成-1的话,会预先读取全部数据到内存中。
127137
* pool_size 是设置DataProvider在内存中暂存的数据条数。设置成-1的话,即不在乎内存暂存多少条数据。
128138
* can_over_batch_size 表示是否允许Paddle暂存略微多余pool_size的数据。这样做可以避免很多死锁问题。
129139
一般推荐设置成True
130140
* calc_batch_size 传入的是一个函数,这个函数以一条数据为参数,返回batch_size的大小。默认情况下一条数据
131141
是一个batch size,但是有时为了计算均衡性,可以将一条数据设置成多个batch size
132142
* cache 是数据缓存的策略,参考 `cache`_
133143
* init_hook 是初始化时调用的函数,参考 `init_hook`_
144+
* use_dynamic_order 如果是true的话,可以返回一个dict,key是data_layer的名字,value是特征值。同时,也可以
145+
返回一个list或者tuple。如果是false的话,只能够返回list或者tuple
146+
* check 设置成true的话,会根据input_types检查数据的合法性。
147+
* check_fail_continue 如果设置成true的话,即使在check中数据不合法,也会扔掉这条数据,继续训练。 如果
148+
check是false的话,没有作用。
134149

135150
input_types
136151
+++++++++++
@@ -190,3 +205,55 @@ DataProvider提供了两种简单的Cache策略。他们是
190205
* CacheType.NO_CACHE 不缓存任何数据,每次都会从python端读取数据
191206
* CacheType.CACHE_PASS_IN_MEM 第一个pass会从python端读取数据,剩下的pass会直接从内存里
192207
读取数据。
208+
209+
210+
注意事项
211+
--------
212+
213+
可能的内存泄露问题
214+
++++++++++++++++++
215+
216+
PaddlePaddle将train.list中的每一行,都传递给process函数,从而生成多个generator。
217+
即如果train.list中,有100个训练文件,即会生成100个generator。这个本身不是一个很
218+
严重的问题。
219+
220+
但是,如果在训练时,每一条训练数据都是一个文件,并且,训练数据非常多的情况下,就
221+
会生成多个generator。每个generator在没有调用的时候,是几乎不占内存的。但是,当调
222+
用过一次的时候,generator便会存下当前的上下文(Context)。而这个Context可能会非常
223+
大。并且,generator至少调用两次才会知道是否停止。所以,即使在process里面只会有一
224+
个yield,也需要两次随机选择到同样的generator的时候,才会释放该段内存。
225+
226+
.. code-block:: python
227+
228+
def func():
229+
yield 0
230+
231+
f = func() # 创建generator
232+
tmp = next(f) # 调用一次,返回0
233+
tmp = next(f) # 调用第二次的时候,才会Stop Iteration
234+
235+
而如果按顺序调用这些generator就不会出现这个问题。
236+
237+
所以最佳实践推荐不要将每一个样本都放入train.list。而是将样本的地址放入另一个文本
238+
文件,train.list写入那个文本文件的地址。 或者在python generator的上下文中尽量留
239+
下非常少的变量引用。例如
240+
241+
.. code-block:: python
242+
243+
def real_process(fn):
244+
# ... read from fn
245+
return result # 当函数返回的时候,python可以解除掉内部变量的引用。
246+
247+
def process(fn):
248+
yield real_process(fn)
249+
250+
这个问题是PyDataProvider读数据时候的逻辑问题,基本上不能整体修正。
251+
252+
253+
内存不够用的情况
254+
++++++++++++++++
255+
256+
PyDataProvider2会尽量使用内存。所以如果对于内存比较小的机器,推荐设置
257+
:code:`pool_size` 变量,而这个变量推荐大于训练的batch size,并且在内存足够
258+
的情况下越大越好。
259+

paddle/gserver/dataproviders/DataProvider.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,13 @@ void DoubleBuffer::startAsyncLoad() {
149149
taskReadySem_.post();
150150
}
151151

152-
ClassRegistrar<DataProvider, DataConfig, bool> DataProvider::registrar_;
153-
DataProvider* DataProvider::create(const DataConfig& config, bool useGpu) {
154-
return registrar_.createByType(config.type(), config, useGpu);
152+
ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
153+
DataProvider::registrar_;
154+
155+
DataProvider* DataProvider::create(const DataConfig& config,
156+
const ModelConfig& modelConfig,
157+
bool useGpu) {
158+
return registrar_.createByType(config.type(), config, modelConfig, useGpu);
155159
}
156160

157161
REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);

0 commit comments

Comments
 (0)