PaddlePaddle
diff --git a/‎CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎Dockerfile
Lines changed: 7 additions & 0 deletions b/‎Dockerfile
Lines changed: 7 additions & 0 deletions
diff --git a/‎Dockerfile.android
Lines changed: 1 addition & 1 deletion b/‎Dockerfile.android
Lines changed: 1 addition & 1 deletion
diff --git a/‎paddle/scripts/check_env.sh renamed to ‎benchmark/paddle/image/check_env.sh b/‎paddle/scripts/check_env.sh renamed to ‎benchmark/paddle/image/check_env.sh
diff --git a/‎cmake/configure.cmake
Lines changed: 10 additions & 0 deletions b/‎cmake/configure.cmake
Lines changed: 10 additions & 0 deletions
diff --git a/‎cmake/tensorrt.cmake
Lines changed: 33 additions & 0 deletions b/‎cmake/tensorrt.cmake
Lines changed: 33 additions & 0 deletions
diff --git a/‎doc/CMakeLists.txt
Lines changed: 3 additions & 1 deletion b/‎doc/CMakeLists.txt
Lines changed: 3 additions & 1 deletion
diff --git a/‎doc/fluid/api/initializer.rst
Lines changed: 42 additions & 0 deletions b/‎doc/fluid/api/initializer.rst
Lines changed: 42 additions & 0 deletions
diff --git a/‎doc/fluid/api/layers.rst
Lines changed: 5 additions & 0 deletions b/‎doc/fluid/api/layers.rst
Lines changed: 5 additions & 0 deletions
diff --git a/‎doc/fluid/design/concepts/parallel_executor.md
Lines changed: 1 addition & 1 deletion b/‎doc/fluid/design/concepts/parallel_executor.md
Lines changed: 1 addition & 1 deletion
@@ -179,6 +179,7 @@ set(EXTERNAL_LIBS
 
 if(WITH_GPU)
     include(cuda)
+    include(tensorrt)
 endif(WITH_GPU)
 
 if(WITH_AMD_GPU)
 
@@ -45,6 +45,13 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # install glide
 RUN curl -s -q https://glide.sh/get | sh
 
+# Install TensorRT
+# The unnecessary files has been removed to make the library small. It only contains include and lib now.
+RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+    tar -xz -C /usr/local && \
+    cp -rf /usr/local/TensorRT/include /usr && \
+    cp -rf /usr/local/TensorRT/lib /usr
+
 # git credential to skip password typing
 RUN git config --global credential.helper store
 
 
@@ -27,7 +27,7 @@ RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
     pip install -U 'protobuf==3.1.0' && \
     pip install -U wheel sphinx && \
     pip install pre-commit
 
@@ -80,6 +80,16 @@ if(WITH_GPU)
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
+
+    if(TENSORRT_FOUND)
+        if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+            message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+        endif()
+        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+            message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+        endif()
+        include_directories(${TENSORRT_INCLUDE_DIR})
+    endif()
 elseif(WITH_AMD_GPU)
     add_definitions(-DPADDLE_WITH_HIP)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
 
@@ -0,0 +1,33 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
+set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
+    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
+    NO_DEFAULT_PATH
+)
+
+find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
+    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
+    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
+    NO_DEFAULT_PATH
+    DOC "Path to TensorRT library.")
+
+if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+    set(TENSORRT_FOUND ON)
+else()
+    set(TENSORRT_FOUND OFF)
+endif()
+
+if(TENSORRT_FOUND)
+    file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
+    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
+        "${TENSORRT_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
+        TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
+
+    message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
+        "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
+endif()
@@ -3,7 +3,9 @@ add_custom_target(paddle_apis ALL
 
 add_custom_target(paddle_docs ALL
                   DEPENDS paddle_v2_docs paddle_v2_docs_cn
-                  paddle_fluid_docs paddle_fluid_docs_cn)
+                  paddle_fluid_docs paddle_fluid_docs_cn
+                  paddle_mobile_docs paddle_mobile_docs_cn)
 
 add_subdirectory(v2)
 add_subdirectory(fluid)
+add_subdirectory(mobile)
@@ -33,3 +33,45 @@ Xavier
     :members:
     :noindex:
 
+MSRA
+------
+
+..  autoclass:: paddle.fluid.initializer.MSRA
+    :members:
+    :noindex:
+
+ConstantInitializer
+-------------------
+
+..  autoclass:: paddle.fluid.initializer.ConstantInitializer
+    :members:
+    :noindex:
+
+UniformInitializer
+------------------
+
+..  autoclass:: paddle.fluid.initializer.UniformInitializer
+    :members:
+    :noindex:
+
+NormalInitializer
+-----------------
+
+..  autoclass:: paddle.fluid.initializer.NormalInitializer
+    :members:
+    :noindex:
+
+XavierInitializer
+-----------------
+
+..  autoclass:: paddle.fluid.initializer.XavierInitializer
+    :members:
+    :noindex:
+    MSRA
+    ------
+
+MSRAInitializer
+-----------------
+..  autoclass:: paddle.fluid.initializer.MSRAInitializer
+    :members:
+    :noindex:
@@ -815,3 +815,8 @@ zeros
 ..  autofunction:: paddle.fluid.layers.zeros
     :noindex:
 
+topk
+----
+
+..  autofunction:: paddle.fluid.layers.topk
+    :noindex:
@@ -84,7 +84,7 @@ Running an operator can be asynchronized. There is a thread pool to execute an `
 
 ## Synchronize GPU Kernels
 
-The GPU is a non-blocking device. The different streams need be synchronized when switing streams. In current implementation, the synchronization based on the following algorithm:
+The GPU is a non-blocking device. The different streams need be synchronized when switching streams. In current implementation, the synchronization based on the following algorithm:
 
 1. `OpHandle` will record `DeviceContext` that it is used.
 2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable.