# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import argparse
import time
import math

import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import sys
import signal

# Fix random seeds so the test is deterministic.
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1

train_parameters = {
    "input_size": [3, 224, 224],
    "input_mean": [0.485, 0.456, 0.406],
    "input_std": [0.229, 0.224, 0.225],
    "learning_strategy": {
        "name": "piecewise_decay",
        "epochs": [30, 60, 90],
        "steps": [0.1, 0.01, 0.001, 0.0001]
    }
}
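
# NOTE: within this file the "learning_strategy" entry above appears to be
# descriptive only; get_model() below builds the equivalent piecewise-decay
# schedule (boundaries from "epochs", values from "steps") directly.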


class SE_ResNeXt():
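    """SE-ResNeXt classifier: ResNeXt grouped-convolution bottlenecks
    combined with Squeeze-and-Excitation channel reweighting
    (Hu et al., "Squeeze-and-Excitation Networks"). Supports the
    50-, 101-, and 152-layer variants; see net().
    """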
    def __init__(self, layers=50):
        self.params = train_parameters
        self.layers = layers

    def net(self, input, class_dim=1000):
        layers = self.layers
        supported_layers = [50, 101, 152]
        assert layers in supported_layers, \
            "supported layers are {}, but the given number of layers is {}".format(
                supported_layers, layers)
        if layers == 50:
            cardinality = 32
            reduction_ratio = 16
            depth = [3, 4, 6, 3]
            num_filters = [128, 256, 512, 1024]

            conv = self.conv_bn_layer(
                input=input,
                num_filters=64,
                filter_size=7,
                stride=2,
                act='relu')
            conv = fluid.layers.pool2d(
                input=conv,
                pool_size=3,
                pool_stride=2,
                pool_padding=1,
                pool_type='max')
        elif layers == 101:
            cardinality = 32
            reduction_ratio = 16
            depth = [3, 4, 23, 3]
            num_filters = [128, 256, 512, 1024]

            conv = self.conv_bn_layer(
                input=input,
                num_filters=64,
                filter_size=7,
                stride=2,
                act='relu')
            conv = fluid.layers.pool2d(
                input=conv,
                pool_size=3,
                pool_stride=2,
                pool_padding=1,
                pool_type='max')
        elif layers == 152:
            cardinality = 64
            reduction_ratio = 16
            depth = [3, 8, 36, 3]
            num_filters = [128, 256, 512, 1024]

            conv = self.conv_bn_layer(
                input=input,
                num_filters=64,
                filter_size=3,
                stride=2,
                act='relu')
            conv = self.conv_bn_layer(
                input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=128,
                filter_size=3,
                stride=1,
                act='relu')
            conv = fluid.layers.pool2d(
                input=conv, pool_size=3, pool_stride=2, pool_padding=1,
                pool_type='max')

        for block in range(len(depth)):
            for i in range(depth[block]):
                conv = self.bottleneck_block(
                    input=conv,
                    num_filters=num_filters[block],
                    stride=2 if i == 0 and block != 0 else 1,
                    cardinality=cardinality,
                    reduction_ratio=reduction_ratio)

        pool = fluid.layers.pool2d(
            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
        drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
        out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
        return out

    def shortcut(self, input, ch_out, stride):
        ch_in = input.shape[1]
        if ch_in != ch_out or stride != 1:
            filter_size = 1
            return self.conv_bn_layer(input, ch_out, filter_size, stride)
        else:
            return input

    def bottleneck_block(self, input, num_filters, stride, cardinality,
                         reduction_ratio):
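        """One SE-ResNeXt bottleneck: 1x1 conv (reduce) -> 3x3 grouped conv
        (groups=cardinality) -> 1x1 conv (expand to num_filters * 2),
        rescaled by the squeeze-excitation branch and added to the
        (possibly projected) shortcut with a final ReLU.
        """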
        conv0 = self.conv_bn_layer(
            input=input, num_filters=num_filters, filter_size=1, act='relu')
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            groups=cardinality,
            act='relu')
        conv2 = self.conv_bn_layer(
            input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
        scale = self.squeeze_excitation(
            input=conv2,
            num_channels=num_filters * 2,
            reduction_ratio=reduction_ratio)

        short = self.shortcut(input, num_filters * 2, stride)

        return fluid.layers.elementwise_add(x=short, y=scale, act='relu')

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None):
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            # integer division: (filter_size - 1) / 2 is a float in Python 3
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            bias_attr=False)
        return fluid.layers.batch_norm(input=conv, act=act)

    def squeeze_excitation(self, input, num_channels, reduction_ratio):
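        """Squeeze-and-Excitation branch: global average pool to one value
        per channel, FC down to num_channels // reduction_ratio with ReLU,
        FC back up to num_channels with sigmoid, then rescale the input
        channel-wise (pool_size is ignored since global_pooling=True).
        """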
        pool = fluid.layers.pool2d(
            input=input, pool_size=0, pool_type='avg', global_pooling=True)
        squeeze = fluid.layers.fc(input=pool,
                                  size=num_channels // reduction_ratio,
                                  act='relu')
        excitation = fluid.layers.fc(input=squeeze,
                                     size=num_channels,
                                     act='sigmoid')
        scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
        return scale


def get_model(batch_size):
    # Input data: constant tensors stand in for real images and labels
    # in this test, so no reader feeding is required.
    image = fluid.layers.fill_constant(
        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
    label = fluid.layers.fill_constant(
        shape=[batch_size, 1], dtype='int64', value=0)

    # Train program
    model = SE_ResNeXt(layers=50)
    out = model.net(input=image, class_dim=102)
    cost = fluid.layers.cross_entropy(input=out, label=label)

    avg_cost = fluid.layers.mean(x=cost)
    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

    # Test program (a clone of the main program, for evaluation)
    test_program = fluid.default_main_program().clone(for_test=True)

    # Optimization
    total_images = 6149  # flowers dataset training-set size
    epochs = [30, 60, 90]
    step = int(total_images / batch_size + 1)

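    # e.g. with batch_size=20: step = 308 batches per epoch, so the
    # boundaries below fall at iterations [9240, 18480, 27720] and the
    # learning rate steps through [0.1, 0.01, 0.001, 0.0001].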
    bd = [step * e for e in epochs]
    base_lr = 0.1
    lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]

    optimizer = fluid.optimizer.Momentum(
        learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr),
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4))
    optimizer.minimize(avg_cost)

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.flowers.train(), batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.dataset.flowers.test(), batch_size=batch_size)

    return test_program, avg_cost, train_reader, test_reader, acc_top1, out


def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
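    """Transpile the single-process program for distributed training;
    pserver- and trainer-side programs are then derived from the result
    via t.get_pserver_program() / t.get_trainer_program()."""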
    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id=trainer_id,
        program=main_program,
        pservers=pserver_endpoints,
        trainers=trainers)
    return t


class DistSeResneXt2x2:
    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
                    trainer_id):
        get_model(batch_size=2)
        t = get_transpiler(trainer_id,
                           fluid.default_main_program(), pserver_endpoints,
                           trainers)
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
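        # The pserver program contains a listen_and_serv op, so this run
        # blocks here and keeps serving parameter RPCs from the trainers.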
        exe.run(pserver_prog)

    def _wait_ps_ready(self, pid):
        retry_times = 20
        while True:
            assert retry_times >= 0, "wait ps ready failed"
            time.sleep(3)
            print("waiting ps ready: ", pid)
            try:
                # listen_and_serv_op writes a /tmp/paddle.<pid>.port file
                # containing its listen port once it is ready to handle
                # RPC calls.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except OSError:
                retry_times -= 1

    def run_trainer(self, place, endpoints, trainer_id, trainers,
                    is_dist=True):
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
            batch_size=20)
        if is_dist:
            t = get_transpiler(trainer_id,
                               fluid.default_main_program(), endpoints,
                               trainers)
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
        strategy.allow_op_delay = False
        exe = fluid.ParallelExecutor(
            True,
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            num_trainers=trainers,
            trainer_id=trainer_id)

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        # Feeder and reader are set up for completeness; the inputs here are
        # fill_constant tensors, so no explicit feeding is performed.
        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()
        first_loss, = exe.run(fetch_list=[avg_cost.name])
        print(first_loss)
        # a few intermediate iterations between the first and last fetch
        for i in range(5):
            loss, = exe.run(fetch_list=[avg_cost.name])
        last_loss, = exe.run(fetch_list=[avg_cost.name])
        print(last_loss)


def main(role="pserver",
         endpoints="127.0.0.1:9123",
         trainer_id=0,
         current_endpoint="127.0.0.1:9123",
         trainers=1,
         is_dist=True):
    model = DistSeResneXt2x2()
    if role == "pserver":
        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
    else:
        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda() \
            else fluid.CPUPlace()
        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)


if __name__ == "__main__":
    if len(sys.argv) != 7:
        print(
            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] "
            "[trainer_id] [current_endpoint] [trainers] [is_dist]")
        sys.exit(1)
    role = sys.argv[1]
    endpoints = sys.argv[2]
    trainer_id = int(sys.argv[3])
    current_endpoint = sys.argv[4]
    trainers = int(sys.argv[5])
    is_dist = sys.argv[6] == "TRUE"
    main(
        role=role,
        endpoints=endpoints,
        trainer_id=trainer_id,
        current_endpoint=current_endpoint,
        trainers=trainers,
        is_dist=is_dist)
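
# Example launch (illustrative; one local pserver plus one trainer):
#   python dist_se_resnext.py pserver 127.0.0.1:9123 0 127.0.0.1:9123 1 TRUE
#   python dist_se_resnext.py trainer 127.0.0.1:9123 0 127.0.0.1:9123 1 TRUE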