Commit de2d729

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into adadelta

2 parents 72847ad + 484cff6

File tree

420 files changed: +8229 −2986 lines


.travis.yml

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ script:
     export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
     export DOCS_DIR=`pwd`
     cd ..
-    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/v2
+    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
 notifications:
   email:
     on_success: change

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
 option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
 option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
 # TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option.
-option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" ON)
+option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF)
 option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)

Dockerfile

Lines changed: 6 additions & 2 deletions
@@ -53,10 +53,14 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 
 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
 # version util jupyter fixes this issue.
+
+# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
+# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
+# version(1.7.1 for now), which causes building documentation failed.
 RUN pip install --upgrade pip && \
     pip install -U wheel && \
-    pip install -U docopt PyYAML sphinx && \
-    pip install -U sphinx-rtd-theme==0.1.9 recommonmark
+    pip install -U docopt PyYAML sphinx==1.5.6 && \
+    pip install sphinx-rtd-theme==0.1.9 recommonmark
 
 RUN pip install pre-commit 'ipython==5.3.0' && \
     pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \

cmake/external/openblas.cmake

Lines changed: 2 additions & 6 deletions
@@ -77,7 +77,8 @@ IF(NOT ${CBLAS_FOUND})
     INSTALL_DIR       ${CBLAS_INSTALL_DIR}
     BUILD_IN_SOURCE   1
     BUILD_COMMAND     ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
-    INSTALL_COMMAND   ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
+    INSTALL_COMMAND   ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
+                      && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
     UPDATE_COMMAND    ""
     CONFIGURE_COMMAND ""
 )
@@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND})
         \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
     )"
 )
-INSTALL(CODE "execute_process(
-    COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake
-            ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig
-    )"
-)
 ENDIF()
 ENDIF(NOT ${CBLAS_FOUND})
 

cmake/generic.cmake

Lines changed: 8 additions & 5 deletions
@@ -186,7 +186,9 @@ function(cc_library TARGET_NAME)
     add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
   else()
     add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
+    find_fluid_modules(${TARGET_NAME})
   endif()
+
   if(cc_library_DEPS)
     # Don't need link libwarpctc.so
     if("${cc_library_DEPS};" MATCHES "warpctc;")
@@ -242,11 +244,11 @@ function(cc_test TARGET_NAME)
   cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   add_executable(${TARGET_NAME} ${cc_test_SRCS})
   # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-  target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+  target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
   if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
     list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
   endif()
-  add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+  add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
   add_test(NAME ${TARGET_NAME}
            COMMAND ${TARGET_NAME} ${cc_test_ARGS}
            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
@@ -263,7 +265,8 @@ function(nv_library TARGET_NAME)
     if (nv_library_SHARED OR nv_library_shared) # build *.so
       cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
     else()
-      cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+      cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+      find_fluid_modules(${TARGET_NAME})
     endif()
     if (nv_library_DEPS)
       add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
@@ -308,8 +311,8 @@ function(nv_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
   endif()
 endfunction(nv_test)

cmake/inference_lib.cmake

Lines changed: 27 additions & 8 deletions
@@ -1,9 +1,22 @@
+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# find all fluid modules is used for paddle fluid static library
+function(find_fluid_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(FIND "${__target_path}" "fluid" pos)
+  if(pos GREATER 1)
+    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+  endif()
+endfunction(find_fluid_modules)
+
 # make package for paddle fluid shared and static library
 function(copy TARGET)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DSTS DEPS)
   cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
 
   list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
   list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -42,13 +55,21 @@ copy(glog_lib
   DSTS ${dst_dir} ${dst_dir}/lib
 )
 
-IF(NOT PROTOBUF_FOUND)
+if(NOT PROTOBUF_FOUND)
     set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
     copy(protobuf_lib
-        SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+        SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
         DSTS ${dst_dir} ${dst_dir}/lib
     )
-ENDIF(NOT PROTOBUF_FOUND)
+endif()
+
+if(NOT CBLAS_FOUND)
+    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+    copy(openblas_lib
+        SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
+        DSTS ${dst_dir} ${dst_dir}
+    )
+endif()
 
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
@@ -66,8 +87,8 @@ copy(memory_lib
 )
 
 set(module "inference")
-copy(inference_lib DEPENDS paddle_fluid_shared
-    SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
+copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
+    SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
    DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
@@ -83,6 +104,4 @@ copy(string_lib
     DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )
 
-add_custom_target(inference_lib_dist DEPENDS
-    inference_lib framework_lib memory_lib platform_lib string_lib
-    gflags_lib glog_lib protobuf_lib eigen3_lib)
+add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})

doc/design/cpp_data_feeding.md

Lines changed: 0 additions & 79 deletions
This file was deleted.

New file

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
## Design Doc: Distributed Lookup Table Operator

This document proposes a lookup table operator for PaddlePaddle in which the table can be larger than the memory of a single computer.

## Background

A lookup table operator is widely used in deep learning for learning the representation, or [*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of symbols.

### The Forward Algorithm

The forward algorithm of the lookup table is a multiplication of the input vector x and the lookup table matrix W:

$$y = x * W$$

When x is a sparse vector of symbols, the multiplication simplifies into looking up the rows of W that correspond to the symbols in x, denoted by W(x). Note that W could be huge and exceed the memory of one machine, so we need a distributed storage service that supports row lookup.

The following figure illustrates the multiplication of x, which has two non-zero elements (i.e., two symbols), with a lookup table W:

![lookup table](./lookup_table.png)
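A minimal NumPy sketch of this row-lookup view of the forward pass, assuming a toy table that fits in local memory; the names `W`, `ids`, and `embed_dim` are illustrative only, not part of the design:

```python
import numpy as np

# Toy lookup table W: one row per symbol (vocab_size x embed_dim).
vocab_size, embed_dim = 10, 4
W = np.random.rand(vocab_size, embed_dim)

# A sparse x with two non-zero entries is in practice represented by
# the ids of those two symbols rather than by a dense 0/1 vector.
ids = np.array([3, 7])

# Forward pass: y = x * W reduces to selecting the rows W(x).
y = W[ids]
print(y.shape)  # (2, 4)
```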

### The Backward Algorithm

The backward algorithm computes W'(x) from W(x). W'(x) has the same size as W(x) and is therefore much smaller than W.

To optimize W given W', we can do a simple SGD update:

$$W = f(W') = \lambda * W'$$

or use a more sophisticated algorithm that relies on both W' and W:

$$W = f(W, W')$$

The following figure illustrates the backward pass of the lookup operator:

![lookup table training](./lookup_table_training.png)
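Assuming the same toy table as above, a minimal sketch of the sparse update, in which only the rows touched in the forward pass are modified and a plain `W[ids] -= lr * grad` step stands in for the abstract optimizer f (all names illustrative):

```python
import numpy as np

# Toy table and the rows looked up in the forward pass.
W = np.random.rand(10, 4)
ids = np.array([3, 7])

# W'(x): the gradient w.r.t. only the looked-up rows; same shape as W(x).
grad_Wx = np.random.rand(len(ids), 4)

# Apply the optimizer only to the touched rows of W, so a dense
# gradient of the full size of W is never materialized.
lr = 0.01
W[ids] -= lr * grad_Wx
```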

## Distributed Storage Service

The forward algorithm requires a distributed storage service for W. The backward algorithm prefers a storage system that can apply the optimization algorithm to W itself. The following two sections describe two solutions: the former does not require the storage service to do optimization; the latter does.

### Storage Service Doesn't Optimize

In this design, we use a highly optimized distributed storage service, e.g., memcached, and we run the optimization algorithm on PaddlePaddle's parameter servers. The following figure illustrates the training process.

<!--
Note: please update the following URL when updating this digraph.
<img src='https://g.gravizo.com/svg?
digraph G {
  rankdir="LR";
  subgraph cluster1 {
    P1 [label="pserver 1"];
    P2 [label="pserver 2"];
    T1 [label="trainer 1"];
    T2 [label="trainer 2"];
    T3 [label="trainer 3"];
  }
  KV [label="memcached"];
  T1 -> P1;
  T1 -> P2;
  T2 -> P1;
  T2 -> P2;
  T3 -> P1;
  T3 -> P2;
  P1 -> KV [color=gray, weight=0.1];
  KV -> P1 [color=gray, weight=0.1];
  P2 -> KV [color=gray, weight=0.1];
  KV -> P2 [color=gray, weight=0.1];
  KV -> T1 [color=gray, weight=0.1];
  KV -> T2 [color=gray, weight=0.1];
  KV -> T3 [color=gray, weight=0.1];
}
)
'/>
-->

<img src='https://g.gravizo.com/svg?%20digraph%20G%20{%20rankdir=%22LR%22;%20subgraph%20cluster1%20{%20P1%20[label=%22pserver%201%22];%20P2%20[label=%22pserver%202%22];%20T1%20[label=%22trainer%201%22];%20T2%20[label=%22trainer%202%22];%20T3%20[label=%22trainer%203%22];%20}%20KV%20[label=%22memcached%22];%20T1%20-%3E%20P1;%20T1%20-%3E%20P2;%20T2%20-%3E%20P1;%20T2%20-%3E%20P2;%20T3%20-%3E%20P1;%20T3%20-%3E%20P2;%20P1%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P1%20[color=gray,%20weight=0.1];%20P2%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T1%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T3%20[color=gray,%20weight=0.1];%20}'/>

Each trainer runs the forward and backward passes using its local data:

1. In the forward pass, when a trainer runs the forward algorithm of a lookup operator, it retrieves W(x) from the storage service.
1. In the backward pass, the trainer computes W'(x) using W(x).

During the global update process (sketched in code below):

1. Each trainer uploads its W'(x) to the parameter servers.
1. The parameter servers run the optimization algorithm, e.g., Adam, which requires that
   1. the parameter server retrieves W(x) from memcached, and
   1. the parameter server pushes $\Delta W(x)=f(W(x), \lambda \sum_j W'(x))$ back to memcached, where $f$ denotes the optimization algorithm.
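A minimal sketch of one such update round on a parameter server, assuming a plain Python dict stands in for memcached and plain SGD stands in for f; names such as `kv_store` and `pserver_update` are illustrative, not an existing Paddle API:

```python
import numpy as np

# Stand-in for memcached: row id -> embedding row of W.
kv_store = {i: np.random.rand(4) for i in range(10)}

def pserver_update(grads_from_trainers, lam=0.01):
    """Apply one global update for the rows touched by the trainers.

    grads_from_trainers: list of {row_id: gradient_row} dicts, one per trainer.
    """
    # Aggregate the sparse gradients W'(x) uploaded by all trainers.
    aggregate = {}
    for grads in grads_from_trainers:
        for row_id, g in grads.items():
            aggregate[row_id] = aggregate.get(row_id, 0) + g

    # For each touched row: fetch W(x) from the store, apply f, push back.
    for row_id, g_sum in aggregate.items():
        row = kv_store[row_id]                 # retrieve W(x)
        kv_store[row_id] = row - lam * g_sum   # push the updated row (SGD as f)

# Example: two trainers each report gradients for the rows they touched.
pserver_update([{3: np.ones(4)}, {3: np.ones(4), 7: np.ones(4)}])
```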

### Storage Service Does Optimize

This design is very similar to the one above, except that the optimization algorithm $f$ runs on the storage service.

- Pro: the parameter servers do not need to retrieve W(x) from the storage service, which saves half of the network communication.
- Con: the storage service needs to be able to run the optimization algorithm.

## Conclusion

Let us implement the "storage service does not optimize" solution first, at least as a baseline, because it is easier to use a well-optimized distributed storage service such as memcached. We can do the "storage service does optimize" solution later or in parallel; if implemented carefully, it should perform better than the former.
-2.38 KB
Binary file not shown.

doc/design/images/duplicate_op.png

-21.4 KB
Binary file not shown.
