@@ -136,6 +136,8 @@ class DistributeTranspilerConfig(object):
     slice_var_up = True
     split_method = None
     min_block_size = 8192
+    # supported modes: pserver, nccl2
+    mode = "pserver"
     print_log = False
 
 
@@ -144,27 +146,30 @@ class DistributeTranspiler(object):
     **DistributeTranspiler**
 
     Convert the fluid program to distributed data-parallelism programs.
+    Supports two modes: pserver mode and nccl2 mode.
 
-    The main_program will be transformed to use a remote parameter server
-    to do parameter optimization. And the optimization graph will be put
-    into a parameter server program.
+    In pserver mode, the main_program will be transformed to use a remote
+    parameter server to do parameter optimization. And the optimization
+    graph will be put into a parameter server program.
+
+    In nccl2 mode, the transpiler will append a NCCL_ID broadcasting
+    op in startup_program to share the NCCL_ID across the job nodes.
+    After transpiling in nccl2 mode, you **must** pass the trainer_id and
+    num_trainers arguments to ParallelExecutor to enable NCCL2 distributed
+    mode.
 
     Examples:
         .. code-block:: python
 
-            # Define your model before these codes.
-            port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-            pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
-            eplist = []
-            for ip in pserver_ips.split(","):
-                eplist.append(':'.join([ip, port]))
-            pserver_endpoints = ",".join(eplist)
-            trainers = int(os.getenv("PADDLE_TRAINERS"))
-            current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
-            trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+            # for pserver mode
+            pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
+            trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
+            current_endpoint = "192.168.0.1:6174"
+            trainer_id = 0
+            trainers = 4
             role = os.getenv("PADDLE_TRAINING_ROLE")
 
-            t = distribute_transpiler.DistributeTranspiler()
+            t = fluid.DistributeTranspiler()
             t.transpile(
                 trainer_id, pservers=pserver_endpoints, trainers=trainers)
             if role == "PSERVER":
@@ -173,6 +178,18 @@ class DistributeTranspiler(object):
                                                                 pserver_program)
             elif role == "TRAINER":
                 trainer_program = t.get_trainer_program()
+
+            # for nccl2 mode
+            config = fluid.DistributeTranspilerConfig()
+            config.mode = "nccl2"
+            t = fluid.DistributeTranspiler(config=config)
+            t.transpile(trainer_id, trainers=trainer_endpoints, current_endpoint=current_endpoint)
+            exe = fluid.ParallelExecutor(
+                use_cuda,
+                loss_name=loss_var.name,
+                num_trainers=len(trainer_endpoints.split(",")),
+                trainer_id=trainer_id
+            )
 
     """
 
     def __init__(self, config=None):
@@ -190,13 +207,41 @@ def __init__(self, config=None):
         assert (self.config.min_block_size >= 8192)
         assert (self.config.split_method.__bases__[0] == PSDispatcher)
 
+    def _transpile_nccl2(self,
+                         trainer_id,
+                         trainers,
+                         current_endpoint,
+                         startup_program=None):
+        if not startup_program:
+            startup_program = default_startup_program()
+        if trainer_id >= 0:
+            worker_endpoints = trainers.split(",")
+            # send NCCL_ID to others or recv from trainer 0
+            worker_endpoints.remove(current_endpoint)
+
+            nccl_id_var = startup_program.global_block().create_var(
+                name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
+            startup_program.global_block().append_op(
+                type="gen_nccl_id",
+                inputs={},
+                outputs={"NCCLID": nccl_id_var},
+                attrs={
+                    "endpoint": current_endpoint,
+                    "endpoint_list": worker_endpoints,
+                    "trainer_id": trainer_id
+                })
+            return nccl_id_var
+        else:
+            raise ValueError("must set trainer_id >= 0")
+
     def transpile(self,
                   trainer_id,
                   program=None,
                   pservers="127.0.0.1:6174",
                   trainers=1,
                   sync_mode=True,
-                  startup_program=None):
+                  startup_program=None,
+                  current_endpoint="127.0.0.1:6174"):
         """
         Run the transpiler.
 
@@ -207,10 +252,15 @@ def transpile(self,
                 default is fluid.default_main_program().
             pservers (str): comma separated ip:port string for the pserver
                 list.
-            trainers (int): number of trainers in the distributed job.
+            trainers (int|str): in pserver mode this is the number of
+                trainers, in nccl2 mode this is a string of trainer
+                endpoints.
             sync_mode (bool): Do sync training or not, default is True.
             startup_program (Program|None): startup_program to transpile,
                 default is fluid.default_main_program().
+            current_endpoint (str): the current endpoint must be passed
+                when transpiling in nccl2 distributed mode. In pserver
+                mode this argument is not used.
         """
         if program is None:
             program = default_main_program()
@@ -220,6 +270,15 @@ def transpile(self,
         self.startup_program = startup_program
         self.origin_startup_program = self.startup_program.clone()
 
+        if self.config.mode == "nccl2":
+            assert (isinstance(trainers, str))
+            self._transpile_nccl2(
+                trainer_id,
+                trainers,
+                current_endpoint,
+                startup_program=startup_program)
+            return
+
         self.trainer_num = trainers
         self.sync_mode = sync_mode
         self.trainer_id = trainer_id
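For context, here is a minimal sketch of how the nccl2 mode added by this patch could be exercised to confirm that the transpile step appends the NCCL_ID broadcasting op ("gen_nccl_id") to the startup program. The endpoint strings are placeholders, the check assumes a Paddle build that registers the distributed/NCCL ops and a version containing this patch, and it is an illustrative test rather than part of the change itself:

    import paddle.fluid as fluid

    # Placeholder endpoints for a hypothetical two-trainer job on one host.
    trainer_endpoints = "127.0.0.1:6170,127.0.0.1:6171"
    current_endpoint = "127.0.0.1:6170"

    # Select the nccl2 mode introduced by this patch.
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"

    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id=0,
        trainers=trainer_endpoints,
        current_endpoint=current_endpoint)

    # _transpile_nccl2 should have added the gen_nccl_id op to the
    # default startup program.
    startup_ops = [
        op.type for op in fluid.default_startup_program().global_block().ops
    ]
    assert "gen_nccl_id" in startup_ops

In a real job, the same trainer script would then construct fluid.ParallelExecutor with num_trainers and trainer_id, as shown in the docstring example above.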