
Commit 5b4f283

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_data_balance

2 parents: b6dc3a5 + fa1fb12

127 files changed: +3506 −1048 lines


benchmark/fluid/args.py (4 additions, 0 deletions)

```diff
@@ -122,5 +122,9 @@ def parse_args():
         type=str,
         default="",
         help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--use_inference_transpiler',
+        action='store_true',
+        help='If set, uses inference transpiler to optimize the program.')
     args = parser.parse_args()
     return args
```

benchmark/fluid/fluid_benchmark.py (5 additions, 0 deletions; file mode changed 100644 → 100755)

```diff
@@ -131,6 +131,11 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
     exe = fluid.Executor(place)
     exe.run(startup_prog)
 
+    # Use inference_transpiler to speed up
+    if args.use_inference_transpiler:
+        t = fluid.InferenceTranspiler()
+        t.transpile(infer_prog, place)
+
     if not args.use_reader_op:
         feed_var_list = [
             var for var in train_prog.global_block().vars.itervalues()
```
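For context, `fluid.InferenceTranspiler` rewrites an inference program in place so it runs faster (e.g. by fusing adjacent operators). A minimal standalone sketch of the pattern the new flag enables — the program and place setup here are illustrative, not taken from the benchmark code:

```python
import paddle.fluid as fluid

place = fluid.CPUPlace()  # or fluid.CUDAPlace(0) on GPU
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Clone the main program for inference, then let the transpiler
# optimize the cloned program in place.
infer_prog = fluid.default_main_program().clone(for_test=True)
t = fluid.InferenceTranspiler()
t.transpile(infer_prog, place)
```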

cmake/external/anakin.cmake (9 additions, 7 deletions)

```diff
@@ -26,13 +26,15 @@ function(fetch_include_recursively root_dir)
     endforeach()
 endfunction()
 
-# download library
-message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
-execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
+    # download library
+    message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+    execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+endif()
 
 if (WITH_ANAKIN)
     message(STATUS "Anakin for inference is enabled")
```

cmake/inference_lib.cmake (21 additions, 9 deletions)

```diff
@@ -149,21 +149,33 @@ copy(memory_lib
     DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
 )
 
-set(module "inference")
-copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
-)
+set(inference_deps paddle_fluid_shared paddle_fluid)
 
 if(WITH_CONTRIB)
-  set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
-  copy(contrib_inference_lib DEPS paddle_inference_api
+  message(STATUS "installing contrib")
+  set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
+  if (WITH_ANAKIN)
+    copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
+      SRCS
+      ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
+      ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
+      DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
+    list(APPEND inference_deps contrib_anakin_inference_lib)
+  endif()
+
+  copy(contrib_inference_lib DEPS paddle_inference_api
     SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
          ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
-    DSTS ${contrib_dst_dir} ${contrib_dst_dir}
-  )
+    DSTS ${contrib_dst_dir} ${contrib_dst_dir})
+  list(APPEND inference_deps contrib_inference_lib)
 endif()
 
+set(module "inference")
+copy(inference_lib DEPS ${inference_deps}
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+)
+
 set(module "platform")
 copy(platform_lib DEPS profiler_py_proto
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
```

doc/about/about_us.rst (new file, 53 additions, 0 deletions)

=========
About Us
=========

What is PaddlePaddle
--------------------

- PaddlePaddle is a deep learning framework developed in-house and open-sourced by Baidu. It lets developers and enterprises realize their AI ideas safely and quickly.

- The project team brings together top deep learning scientists from around the world, dedicated to giving developers and enterprises the best possible deep learning R&D experience.

- The framework is easy to learn, easy to use, safe, and efficient, making it the deep learning tool best suited for Chinese developers and enterprises.

Technical highlights of PaddlePaddle
-------------------------------------

- A new generation of deep learning framework: PaddlePaddle is built on the idea of a "deep learning programming language". Without sacrificing performance, it greatly improves the framework's expressive power and can describe any model that may potentially arise.

- Friendlier to large-scale computation: hardened by a wide range of large-scale computing workloads inside Baidu, PaddlePaddle performs very well in distributed computing. Its EDL technology can save a large amount of computing resources, and it also supports training large-scale sparse models.

- Visualized deep learning: Visual DL helps developers conveniently observe overall training trends, the quality of data samples and intermediate results, the distribution and evolution of parameters, and the structure of models, making the whole programming process easier.

An education system built on PaddlePaddle
------------------------------------------

- Deep learning courses: Baidu has developed high-quality deep learning courses and teaching materials together with the top education and training institutions in the Chinese market, helping developers master deep learning from scratch.

- Hands-on deep learning training: for users whose goals are research and learning, PaddlePaddle provides an online development environment that needs no installation, along with algorithm, computing-power, and data support.

- Offline training: rich, high-quality offline education activities, such as training for young faculty, hands-on boot camps, salons, and other forms of training and exchange.


AI services built on PaddlePaddle
----------------------------------

- EasyDL: helps enterprises with no algorithm background quickly complete a deep learning task; only a small amount of data is needed to obtain a high-quality model.

- AI marketplace: provides standardized trading mechanisms for AI capabilities and products, helping enterprises quickly find what they need and run their AI business effectively.

- Deep learning competitions: PaddlePaddle gathers top deep learning developers; enterprises can publish their business problems and quickly find the best solution through competition.

You can reach us about any PaddlePaddle question through the following channels
--------------------------------------------------------------------------------

- Learning/usage questions: give us feedback in the `PaddlePaddle open-source community <https://github.com/PaddlePaddle/Paddle/issues>`_ or the `PaddlePaddle Chinese community <http://ai.baidu.com/forum/topic/list/168>`_.

- Suggestions on the development of the PaddlePaddle framework: email us at [email protected]

We look forward to building a world-class deep learning framework together with you and to jointly advancing AI technology.



The PaddlePaddle Team

doc/fluid/design/concepts/lod_tensor.md (20 additions, 0 deletions)

````diff
@@ -173,6 +173,7 @@ are transformed into offsets of elements/words as follows:
 
 ## Slicing of LoD Tensors
 
+
 When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
 
 For example, the <2>-slice of above example is
@@ -189,3 +190,22 @@ and the <2,0>-slice of above slice is
  10 12
 ||
 ```
+
+## Length Representation vs Offset Representation
+
+The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult.
+Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API.
+Specifically, we call this length representation `recursive_sequence_lengths`, and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python:
+```Python
+# length representation of lod called recursive_sequence_lengths
+recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]]
+# Create a LoDTensor that has the above recursive_sequence_lengths info.
+# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood.
+tensor = fluid.LoDTensor(recursive_seq_lens)
+
+# Set/Change the recursive_sequence_lengths info of LoDTensor
+tensor.set_recursive_sequence_lengths([[3, 1, 2]])
+# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted
+# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]]
+new_recursive_seq_lens = tensor.recursive_sequence_lengths()
+```
````
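As a quick illustration of how the two representations relate (plain Python, independent of the Paddle API): each LoD level's offsets are simply the running sums of that level's lengths, starting from 0.

```Python
from itertools import accumulate

def lengths_to_offsets(recursive_seq_lens):
    # Each level's offsets are [0] followed by the cumulative sums
    # of that level's sequence lengths.
    return [[0] + list(accumulate(level)) for level in recursive_seq_lens]

# [[3, 1, 2], [2, 2, 1, 3, 1, 2]] -> [[0, 3, 4, 6], [0, 2, 4, 5, 8, 9, 11]]
print(lengths_to_offsets([[3, 1, 2], [2, 2, 1, 3, 1, 2]]))
```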
Lines changed: 130 additions & 0 deletions (new file)

# Python Data Feeding

In the former implementation of Paddle Fluid, there were two ways to feed data:

- Use `reader_op` on the backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For example, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details.

- Feed data directly using `DataFeeder.feed()` in Python code (a minimal sketch follows this list). This is more flexible than the first way: many kinds of preprocessing steps can be performed before feeding, using Python or any other language, instead of adding many uncommon `operators` on the C++ side. But it is less efficient: the program cannot read the next mini-batch before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance.
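For reference, the second way looks roughly like this (a minimal sketch; `image`, `label`, `loss`, `train_reader`, and the executor setup are assumed to exist):

```Python
import paddle.fluid as fluid

# DataFeeder converts Python mini-batches into LoDTensors and feeds
# them synchronously into each Executor.run() call.
feeder = fluid.DataFeeder(feed_list=[image, label], place=fluid.CPUPlace())
for data in train_reader():
    exe.run(fluid.default_main_program(),
            feed=feeder.feed(data),
            fetch_list=[loss])
```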
In this document, we design a Python Data Feeding process that combines the efficiency of the first way with the flexibility of the second way. A data queue `LoDTensorBlockingQueue` is designed to be shared by the Python and C++ sides: a `LoDTensorArray` is pushed into the queue on the Python side, while `reader_op` on the C++ side reads the data out of the queue.

## Design of LoDTensorBlockingQueue
`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` that accepts `std::vector<framework::LoDTensor>` objects with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer the construction of the `LoDTensorBlockingQueue`.

```C++
class LoDTensorBlockingQueueHolder;

class LoDTensorBlockingQueue {
  friend class LoDTensorBlockingQueueHolder;

 private:
  // `LoDTensorBlockingQueue` can only be constructed by
  // `LoDTensorBlockingQueueHolder::InitOnce()`
  LoDTensorBlockingQueue(size_t capacity, const std::vector<framework::DDim>& dims);

 public:
  // Get the current size of the queue
  size_t Size() const { return queue_.Size(); }

  // Get the capacity of the queue
  size_t Cap() const { return queue_.Cap(); }

  void Close() { return queue_.Close(); }

  bool IsClosed() const { return queue_.IsClosed(); }

  // Block if Size() == Cap()
  // Return false only when queue_.IsClosed() == true
  bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec);

  // Block if Size() == 0.
  // *success is set to false only when queue_.IsClosed() == true
  std::vector<framework::LoDTensor> Pop(bool* success = nullptr);

 private:
  // Use reader::BlockingQueue as the inner data structure
  BlockingQueue<std::vector<framework::LoDTensor>> queue_;
  std::vector<framework::DDim> dims_;
};

class LoDTensorBlockingQueueHolder {
 public:
  // Call the constructor of `LoDTensorBlockingQueue` to create queue_.
  // `InitOnce` can only be called once; otherwise an exception is raised.
  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
    PADDLE_ENFORCE(queue_ == nullptr);
    queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
  }

  const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const { return queue_; }

 private:
  std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
```

There are some major points to consider:
- `LoDTensorBlockingQueueHolder` should be a `Variable` in the global scope, so that `reader_op` can find it when reading data.
- A `Variable` of `LoDTensorBlockingQueueHolder`, but not a `VarDesc`, must be created in Python code before `Executor::Run()`, so that `Executor::Run()` can get the feeding data when it is called.
- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input.

## Release of the GIL in pybind
`pybind11::gil_scoped_release` is used to release the GIL (Global Interpreter Lock) when `LoDTensorBlockingQueue::Push()` or `Executor::Run()` is invoked from the Python side, so that `LoDTensorBlockingQueue::Push()` and `Executor::Run()` can run in parallel.
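The overlap this enables looks roughly as follows on the Python side (a sketch only: `feed_queue` comes from the `py_reader` code below, and the `push`/`close` method names exposed through pybind are assumptions here):

```Python
import threading

def feed_data(feed_queue, reader):
    # Because Push() releases the GIL, this loop overlaps with
    # Executor.run() in the main thread instead of serializing on it.
    for batch in reader():
        feed_queue.push(batch)  # blocks while the queue is full
    feed_queue.close()

feed_thread = threading.Thread(target=feed_data, args=(feed_queue, train_reader))
feed_thread.start()
for _ in range(num_iterations):
    exe.run(fetch_list=[loss])  # reader_op pops batches from the queue
```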
## Design of PyReader
`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object.
```C++
class PyReader : public ReaderBase {
 public:
  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);

  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    bool success;
    *out = queue_->Pop(&success);
    if (!success) out->clear();
  }

  void ReInit() override { return; }

 private:
  std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
```

## Design of CreatePyReaderOp
`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue`, which indicates the name of the `LoDTensorBlockingQueueHolder` variable.
```C++
class CreatePyReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    if (out->Get() != nullptr) return;

    const std::string& queue_name = Input("blocking_queue");
    auto* queue_holder_var = scope.FindVar(queue_name);
    PADDLE_ENFORCE(queue_holder_var != nullptr);
    auto* queue_holder = queue_holder_var
        ->template GetMutable<framework::LoDTensorBlockingQueueHolder>();
    out->Reset(new PyReader(queue_holder->GetQueue()));
  }
};
```

## Design of Python codes
The design of the Python code is as follows. First, we construct a `LoDTensorBlockingQueueHolder` variable and initialize it with the given parameters, obtaining the `LoDTensorBlockingQueue` object after initialization. After that, a `CreatePyReaderOp` layer is constructed, accepting the name of the `LoDTensorBlockingQueueHolder` variable as input. Both the `LoDTensorBlockingQueue` object and the result of the layer are returned.
```Python
def py_reader(capacity, shapes):
    queue_name = unique_name.generate("lod_tensor_blocking_queue")
    var = global_scope().var(queue_name)  # create LoDTensorBlockingQueueHolder Variable
    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)  # init the queue
    out = create_var()
    create_py_reader_op_with_queue_name(
        inputs={'blocking_queue': queue_name},
        outputs={'Out': [out]})
    return out, feed_queue
```
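To close the loop, here is a sketch of how a training program might consume the returned values and feed the queue. The `read_file` layer usage and the queue's `push` method are assumptions for illustration; the tensor construction follows the existing `core.LoDTensor` API.

```Python
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core

reader, feed_queue = py_reader(capacity=64, shapes=[(-1, 784), (-1, 1)])
image, label = fluid.layers.read_file(reader)  # build the model on image/label

def push_one_batch(queue, np_arrays, place):
    # Convert numpy arrays into LoDTensors and push them as one batch;
    # reader_op will pop this batch inside Executor::Run().
    batch = []
    for arr in np_arrays:
        t = core.LoDTensor()
        t.set(arr, place)
        batch.append(t)
    queue.push(batch)

push_one_batch(feed_queue,
               [np.random.rand(32, 784).astype('float32'),
                np.random.randint(0, 10, size=(32, 1)).astype('int64')],
               core.CPUPlace())
```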

doc/fluid/howto/optimization/host_memory_profiling_cn.md (4 additions, 4 deletions)

```diff
@@ -1,4 +1,4 @@
-## Heap memory analysis and optimization
+# Heap memory analysis and optimization
 
 Any computer program can suffer from memory leaks. A **memory leak** generally happens when a program allocates memory on the heap without releasing it: the memory footprint keeps growing as the program runs, which can hurt the program's stability (it may get slower and slower, or hit OOM), and can even destabilize the machine the program runs on and bring it down.
 
@@ -20,11 +20,11 @@ Paddle also provides a gperftool-based [CPU profiling tutorial](https://github.com/P
 
 For heap memory analysis, we mainly use thread-caching malloc and heap profiling with tcmalloc.
 
-## Usage workflow
-#### Environment
+## Environment
+
 This tutorial uses the Docker development environment paddlepaddle/paddle:latest-dev provided by Paddle, based on Ubuntu 16.04.4 LTS.
 
-#### Usage workflow
+## Usage workflow
 
 - Install google-perftools
 
```
Lines changed: 26 additions & 0 deletions (new file)

# How to use the timeline tool for performance profiling

1. Add `with profiler.profiler(...)` around the main training loop. After the run, the code will generate a profile record file under the `/tmp/profile` directory.

    **Tip:**
    Do not run too many iterations while the timeline is recording, because the number of records in the timeline is proportional to the number of iterations.

    ```python
    with profiler.profiler('All', 'total', '/tmp/profile') as prof:
        for pass_id in range(pass_num):
            for batch_id, data in enumerate(train_reader()):
                exe.run(fluid.default_main_program(),
                        feed=feeder.feed(data),
                        fetch_list=[])
                ...
    ```

1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default this generates a `/tmp/timeline` file; the path can be changed with a command-line argument, see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py).

1. Open the Chrome browser, go to <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.

    ![chrome tracing](./tracing.jpeg)

1. The result looks like the figure below; you can zoom in to view the details of the timeline.

    ![chrome timeline](./timeline.jpeg)
File renamed without changes.
