Commit f173341

Merge remote-tracking branch 'baidu/develop' into feature/sppnet
2 parents: dfbde28 + 05204af

11 files changed: +101 −82 lines

CMakeLists.txt

Lines changed: 6 additions & 8 deletions
```diff
@@ -109,11 +109,9 @@ else()
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -O3 --use_fast_math")

     if(WITH_AVX)
-        if(AVX_FOUND)
-            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -mavx")
-        endif(AVX_FOUND)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
     else(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -msse3")
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
     endif(WITH_AVX)

     if(WITH_DSO)
@@ -138,11 +136,11 @@ if(NOT WITH_TIMER)
 endif(NOT WITH_TIMER)

 if(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
 else(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
 endif(WITH_AVX)

 if(WITH_PYTHON)
```

cmake/FindAVX.cmake

Lines changed: 52 additions & 41 deletions
```diff
@@ -3,36 +3,55 @@

 INCLUDE(CheckCXXSourceRuns)

-SET(FIND_AVX_10)
-SET(FIND_AVX_20)
-SET(AVX_FLAGS)
-SET(AVX_FOUND)
-
-# Check AVX 2
-SET(CMAKE_REQUIRED_FLAGS)
 IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    SET(CMAKE_REQUIRED_FLAGS "-mavx2")
-ELSEIF(MSVC AND NOT CMAKE_CL_64) # reserve for WINDOWS
-    SET(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
+    set(MMX_FLAG "-mmmx")
+    set(SSE2_FLAG "-msse2")
+    set(SSE3_FLAG "-msse3")
+    SET(AVX_FLAG "-mavx")
+    SET(AVX2_FLAG "-mavx2")
+ELSEIF(MSVC)
+    set(MMX_FLAG "/arch:MMX")
+    set(SSE2_FLAG "/arch:SSE2")
+    set(SSE3_FLAG "/arch:SSE3")
+    SET(AVX_FLAG "/arch:AVX")
+    SET(AVX2_FLAG "/arch:AVX2")
 ENDIF()

+# Check MMX
+set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
 CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
+#include <mmintrin.h>
 int main()
 {
-    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-    __m256i result = _mm256_abs_epi32 (a);
+    _mm_setzero_si64();
     return 0;
-}" FIND_AVX_20)
+}" MMX_FOUND)

-# Check AVX
-SET(CMAKE_REQUIRED_FLAGS)
-IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    SET(CMAKE_REQUIRED_FLAGS "-mavx")
-ELSEIF(MSVC AND NOT CMAKE_CL_64)
-    SET(CMAKE_REQUIRED_FLAGS "/arch:AVX")
-endif()
+# Check SSE2
+set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
+CHECK_CXX_SOURCE_RUNS("
+#include <emmintrin.h>
+int main()
+{
+    _mm_setzero_si128();
+    return 0;
+}" SSE2_FOUND)

+# Check SSE3
+set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
+CHECK_CXX_SOURCE_RUNS("
+#include <pmmintrin.h>
+int main()
+{
+    __m128d a = _mm_set1_pd(6.28);
+    __m128d b = _mm_set1_pd(3.14);
+    __m128d result = _mm_addsub_pd(a, b);
+    result = _mm_movedup_pd(result);
+    return 0;
+}" SSE3_FOUND)
+
+# Check AVX
+set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
@@ -41,25 +60,17 @@ int main()
     __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
     __m256 result = _mm256_add_ps (a, b);
     return 0;
-}" FIND_AVX_10)
-
-IF(${FIND_AVX_20})
-    IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        SET(AVX_FLAGS "${AVX_FLAGS} -mavx2")
-    ELSEIF(MSVC)
-        SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX2")
-    ENDIF()
-ENDIF()
+}" AVX_FOUND)

-IF(${FIND_AVX_10})
-    IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        SET(AVX_FLAGS "${AVX_FLAGS} -mavx")
-    ELSEIF(MSVC)
-        SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
-    ENDIF()
-ENDIF()
+# Check AVX 2
+set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
+    __m256i result = _mm256_abs_epi32 (a);
+    return 0;
+}" AVX2_FOUND)

-IF(${FIND_AVX_10})
-    SET(AVX_FOUND TRUE)
-    MESSAGE(STATUS "Find CPU supports ${AVX_FLAGS}.")
-ENDIF()
+mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
```
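`CHECK_CXX_SOURCE_RUNS` (from CMake's `CheckCXXSourceRuns` module) compiles the given source with `CMAKE_REQUIRED_FLAGS` and then executes the resulting binary, so each `*_FOUND` variable above is set only when the build machine's CPU can actually run that instruction set, not merely when the compiler accepts the flag. A rough Python sketch of this compile-and-run probe, for intuition only (the `c++` driver name and the temporary-file layout are assumptions, not what CMake does literally):

```python
import os
import shutil
import subprocess
import tempfile


def check_cxx_source_runs(source, flags):
    """Compile `source` with `flags` and run it; True iff both steps exit 0."""
    workdir = tempfile.mkdtemp()
    try:
        src = os.path.join(workdir, 'check.cc')
        exe = os.path.join(workdir, 'check')
        with open(src, 'w') as f:
            f.write(source)
        # Step 1: the flag must be accepted and the source must compile.
        if subprocess.call(['c++', src, '-o', exe] + flags) != 0:
            return False
        # Step 2: the binary must actually run on this CPU (exit status 0).
        return subprocess.call([exe]) == 0
    except OSError:
        return False
    finally:
        shutil.rmtree(workdir)


AVX_TEST = """
#include <immintrin.h>
int main()
{
    __m256 a = _mm256_set1_ps(1.0f);
    (void)a;
    return 0;
}
"""
print(check_cxx_source_runs(AVX_TEST, ['-mavx']))  # True only on AVX-capable hosts
```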

demo/image_classification/.gitignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -5,3 +5,5 @@ plot.png
 train.log
 image_provider_copy_1.py
 *pyc
+train.list
+test.list
```

demo/image_classification/data/download_cifar.sh

File mode changed: 100644 → 100755 (now executable).

demo/image_classification/image_provider.py

Lines changed: 22 additions & 17 deletions
```diff
@@ -58,24 +58,29 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
     settings.logger.info('DataProvider Initialization finished')


-@provider(init_hook=hook)
-def processData(settings, file_name):
+@provider(init_hook=hook, min_pool_size=0)
+def processData(settings, file_list):
     """
     The main function for loading data.
     Load the batch, iterate all the images and labels in this batch.
-    file_name: the batch file name.
+    file_list: the batch file list.
     """
-    data = cPickle.load(io.open(file_name, 'rb'))
-    indexes = list(range(len(data['images'])))
-    if settings.is_train:
-        random.shuffle(indexes)
-    for i in indexes:
-        if settings.use_jpeg == 1:
-            img = image_util.decode_jpeg(data['images'][i])
-        else:
-            img = data['images'][i]
-        img_feat = image_util.preprocess_img(img, settings.img_mean,
-                                             settings.img_size, settings.is_train,
-                                             settings.color)
-        label = data['labels'][i]
-        yield img_feat.tolist(), int(label)
+    with open(file_list, 'r') as fdata:
+        lines = [line.strip() for line in fdata]
+        random.shuffle(lines)
+        for file_name in lines:
+            with io.open(file_name.strip(), 'rb') as file:
+                data = cPickle.load(file)
+                indexes = list(range(len(data['images'])))
+                if settings.is_train:
+                    random.shuffle(indexes)
+                for i in indexes:
+                    if settings.use_jpeg == 1:
+                        img = image_util.decode_jpeg(data['images'][i])
+                    else:
+                        img = data['images'][i]
+                    img_feat = image_util.preprocess_img(img, settings.img_mean,
+                                                         settings.img_size, settings.is_train,
+                                                         settings.color)
+                    label = data['labels'][i]
+                    yield img_feat.astype('float32'), int(label)
```
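With this change, `processData` no longer receives a single pickled batch file: each line that PaddlePaddle reads from `train.list` is handed to the provider as `file_list`, and that file in turn enumerates the pickled batch files. A minimal sketch of the resulting on-disk layout (Python 2, matching the demo; the batch file name `batch_0` and its toy contents are made up, and the sketch assumes the demo's `data/cifar-out/batches/` directory already exists — only the `'images'`/`'labels'` keys are taken from the code above):

```python
import cPickle  # the demo targets Python 2

import numpy as np

# 1. A pickled batch file: a dict with parallel 'images' and 'labels' lists.
batch = {'images': [np.zeros(3072, dtype='uint8')], 'labels': [0]}
with open('data/cifar-out/batches/batch_0', 'wb') as f:
    cPickle.dump(batch, f)

# 2. train.txt (written by preprocess.py) lists batch files, one per line.
with open('data/cifar-out/batches/train.txt', 'w') as f:
    f.write('data/cifar-out/batches/batch_0\n')

# 3. train.list (referenced from the trainer config) points at train.txt;
#    each of its lines becomes the `file_list` argument of processData().
with open('train.list', 'w') as f:
    f.write('data/cifar-out/batches/train.txt\n')
```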

demo/image_classification/preprocess.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -35,6 +35,8 @@ def option_parser():
     data_creator = ImageClassificationDatasetCreater(data_dir,
                                                      processed_image_size,
                                                      color)
+    data_creator.train_list_name = "train.txt"
+    data_creator.test_list_name = "test.txt"
     data_creator.num_per_batch = 1000
     data_creator.overwrite = True
     data_creator.create_batches()
```

demo/image_classification/preprocess.sh

Lines changed: 3 additions & 0 deletions
```diff
@@ -17,3 +17,6 @@ set -e
 data_dir=./data/cifar-out

 python preprocess.py -i $data_dir -s 32 -c 1
+
+echo "data/cifar-out/batches/train.txt" > train.list
+echo "data/cifar-out/batches/test.txt" > test.list
```

demo/image_classification/vgg_16_cifar.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -25,8 +25,8 @@
         'img_size': 32,'num_classes': 10,
         'use_jpeg': 1,'color': "color"}

-define_py_data_sources2(train_list=data_dir+"train.list",
-                        test_list=data_dir+'test.list',
+define_py_data_sources2(train_list="train.list",
+                        test_list="train.list",
                         module='image_provider',
                         obj='processData',
                         args=args)
```

doc/cluster/opensource/cluster_train.md

Lines changed: 10 additions & 12 deletions
````diff
@@ -1,26 +1,24 @@
-# Cluster Training
+# Distributed Training

-We provide some simple scripts ```paddle/scripts/cluster_train``` to help you to launch cluster training Job to harness PaddlePaddle's distributed trainning. For MPI and other cluster scheduler refer this naive script to implement more robust cluster training platform by yourself.
+In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).

-The following cluster demo is based on RECOMMENDATION local training demo in PaddlePaddle ```demo/recommendation``` directory. Assuming you enter the ```paddle/scripts/cluster_train/``` directory.
+[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and Kubernetes.

-## Pre-requirements
+## Prerequisite

-Firstly,
+1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands. We can use `pip` to install fabric:

-```bash
+```bash
 pip install fabric
-```
-
-Secondly, go through installing scripts to install PaddlePaddle at all nodes to make sure demo can run as local mode. For CUDA enabled training, we assume that CUDA is installed in ```/usr/local/cuda```, otherwise missed cuda runtime libraries error could be reported at cluster runtime. In one word, the local training environment should be well prepared for the simple scripts.
+```

-Then you should prepare same ROOT_DIR directory in all nodes. ROOT_DIR is from in cluster_train/conf.py. Assuming that the ROOT_DIR = /home/paddle, you can create ```paddle``` user account as well, at last ```paddle.py``` can ssh connections to all nodes with ```paddle``` user automatically.
+1. We need to install PaddlePaddle on all nodes in the cluster. To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime.

-At last you can create ssh mutual trust relationship between all nodes for easy ssh login, otherwise ```password``` should be provided at runtime from ```paddle.py```.
+1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes. For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`. In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without password.

 ## Prepare Job Workspace

-```Job workspace``` is defined as one package directory which contains dependency libraries, train data, test data, model config file and all other related file dependencies.
+We refer to the directory where we put dependent libraries, config files, etc., as *workspace*.

 These ```train/test``` data should be prepared before launching cluster job. To satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as ```train.list/test.list``` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files, and all nodes in cluster job will handle files with same logical code in normal condition.

````
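The rewritten Prerequisite section leans on fabric for running commands over SSH. For readers unfamiliar with it, here is a minimal fabric 1.x sketch (host addresses are hypothetical; the cluster scripts presumably drive fabric in a similar way to launch trainers on each node):

```python
# Minimal fabric 1.x usage: run one command on every node as user `paddle`.
# Invoke with:  fab show_hostname
from fabric.api import env, run

env.user = 'paddle'
env.hosts = ['192.168.0.1', '192.168.0.2']  # hypothetical node addresses


def show_hostname():
    # fabric executes this task once per host in env.hosts, over SSH.
    run('hostname')
```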

paddle/cuda/src/hl_dso_loader.cc

Lines changed: 1 addition & 1 deletion
```diff
@@ -48,7 +48,7 @@ static inline std::string join(const std::string& part1, const std::string& part

 static inline void GetDsoHandleFromDefaultPath(
     std::string& dso_path, void** dso_handle, int dynload_flags) {
-    LOG(INFO) << "Try to find cuda library: " << dso_path
+    VLOG(3) << "Try to find cuda library: " << dso_path
               << " from default system path.";
     // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
     *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
```
