
Commit 23f5c18

Author: kavyasrinet
Fixed a few comments in transpiler (#7748)

* Updating the cluster training doc
* Fixed a few comments of transpiler
* Adding a few explanations

1 parent b7eeef2 commit 23f5c18

File tree: 1 file changed (+28, -21 lines)


python/paddle/v2/fluid/distribute_transpiler.py

Lines changed: 28 additions & 21 deletions
@@ -38,14 +38,14 @@ def split_dense_variable(var_list,
                          min_block_size=1024,
                          max_block_size=1048576):
     """
-        We may need to split dense tensor to one or several blocks and put
+        We may need to split dense tensor to one or more blocks and put
         them equally onto parameter server. One block is a sub-tensor
         aligned by dim[0] of the tensor.
-
+
         We need to have a minimal block size so that the calculations in
         the parameter server side can gain better performance. By default
-        mininum block size is 1024. The max block size is used to prevent
-        too large block that may causing send error.
+        minimum block size is 1024. The max block size is used to prevent
+        very large blocks that may cause send error.
     """
     blocks = []
     for var in var_list:
@@ -64,7 +64,7 @@ def split_dense_variable(var_list,
         remains = block_size % dim1
         if remains != 0:
             block_size += dim1 - remains
-        # update split_count after align
+        # update split_count after aligning
         split_count = int(math.ceil(var_numel / float(block_size)))
         for block_id in xrange(split_count):
             curr_block_size = min(block_size, var_numel - (
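
For reference, a minimal standalone sketch of the splitting arithmetic the two hunks above describe: pick a block size, align it to the row width product(dim[1:]) so a block never cuts a row, then recompute split_count. The initial choice of block_size is not visible in this diff, so the clamp against pserver_count below is an assumption, and split_into_blocks with its arguments is an illustrative name, not part of the module.

    import math

    def split_into_blocks(var_numel, dim1, pserver_count,
                          min_block_size=1024, max_block_size=1048576):
        # Assumed starting point (not shown in this hunk): aim for roughly one
        # block per pserver, clamped to [min_block_size, max_block_size].
        block_size = min(max(min_block_size, var_numel // pserver_count),
                         max_block_size)
        # Align the block size to the row width so blocks stay aligned by dim[0].
        remains = block_size % dim1
        if remains != 0:
            block_size += dim1 - remains
        # Update split_count after aligning, as the diff comment says.
        split_count = int(math.ceil(var_numel / float(block_size)))
        blocks = []
        for block_id in range(split_count):
            # The last block may be smaller than block_size.
            blocks.append(min(block_size, var_numel - block_id * block_size))
        return blocks

    # A 5000 x 10 tensor over 3 pservers splits into [16670, 16670, 16660].
    print(split_into_blocks(50000, 10, 3))
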
@@ -83,18 +83,18 @@ def transpile(self,
                   trainers=1,
                   split_method=round_robin):
         """
-            Transpile the program to a distributed data-parallelism programs.
-            The main_program will be transform to use a remote parameter server
+            Transpile the program to distributed data-parallelism programs.
+            The main_program will be transformed to use a remote parameter server
             to do parameter optimization. And the optimization graph will be put
-            in to a parameter server program.
+            into a parameter server program.
 
-            Use different methods to split trainable varialbles to different
+            Use different methods to split trainable variables to different
             parameter servers.
 
             :param optimize_ops: op list of optimization, should be the
                 return value of Optimizer.minimize
             :type optimize_ops: list
-            :param program: program to optimize, default default_main_program
+            :param program: program to optimize, default is default_main_program
             :param pservers: parameter server endpoints like "m1:6174,m2:6174"
             :type pservers: string
             :return: return a list of programs
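
To make the parameters above concrete, a hedged usage sketch follows. It is based only on what this docstring and the rest of this diff show: the endpoints and two-trainer setup are made up, DistributeTranspiler is the assumed class name in this module, the optimizer construction is only sketched, and the call assumes optimize_ops is the only required positional argument, which may not hold at this commit (a params_grads argument may also be required).

    import paddle.v2.fluid as fluid
    from paddle.v2.fluid.distribute_transpiler import DistributeTranspiler, round_robin

    # ... build a network and obtain avg_cost first (omitted), then:
    optimizer = fluid.optimizer.SGD(learning_rate=0.01)
    optimize_ops, params_grads = optimizer.minimize(avg_cost)

    t = DistributeTranspiler()                    # assumed class name in this module
    t.transpile(
        optimize_ops,                             # return value of Optimizer.minimize
        program=fluid.default_main_program(),     # the stated default
        pservers="192.168.0.1:6174,192.168.0.2:6174",  # endpoint format from the docstring
        trainers=2,
        split_method=round_robin)                 # default shown in the signature

    pserver_prog = t.get_pserver_program("192.168.0.1:6174")  # defined later in this file
    trainer_prog = t.get_trainer_program()                    # defined later in this file
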
@@ -106,11 +106,11 @@ def transpile(self,
         self.trainers = trainers
         self.optimize_ops = optimize_ops
         # steps to transpile:
-        # 1. split variable to multiple blocks, align by product(dim[1:]) (width).
+        # 1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
         # 2. modify trainer program add split_op to each Grad.
         # 3. append send_op to trainer.
         # 4. append concat_op to trainer to update local weights.
-        # 5. create new program as parameter server.
+        # 5. create new program for parameter server.
         # 6. create parameter server program by split_method generated endpoint->VarBlock
 
         pserver_endpoints = pservers.split(",")
@@ -136,10 +136,10 @@ def transpile(self,
         for b in param_blocks:
             varname, block_id, _ = b.split(":")
             send_outputs.append(param_var_mapping[varname][int(block_id)])
-        # let send_op know which endpoint to send which var, eplist is of the same
-        # order of send_inputs.
+        # let send_op know which endpoint to send which var to, eplist has the same
+        # order as send_inputs.
         eplist = split_method(send_inputs, pserver_endpoints)
-        # create mapping of endpoint -> splited var to create pserver side program
+        # create mapping of endpoint -> split var to create pserver side program
         self.param_grad_ep_mapping = dict()
         for i, ep in enumerate(eplist):
             param = send_outputs[i]
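
The call to split_method above implies a simple contract: given the list of variables to send and the pserver endpoints, return one endpoint per variable, in the same order. Below is a hedged sketch of what the default round_robin could look like under that contract; the real implementation in this module may differ.

    def round_robin(varlist, pserver_endpoints):
        # One endpoint per variable, in the same order as varlist,
        # cycling through the endpoints.
        return [pserver_endpoints[i % len(pserver_endpoints)]
                for i, _ in enumerate(varlist)]

    # e.g. round_robin(["w.block0", "w.block1", "b.block0"], ["ps0:6174", "ps1:6174"])
    #      -> ["ps0:6174", "ps1:6174", "ps0:6174"]
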
@@ -149,6 +149,7 @@ def transpile(self,
             self.param_grad_ep_mapping[ep]["params"].append(param)
             self.param_grad_ep_mapping[ep]["grads"].append(grad)
 
+        # create send_op
         send_op = program.global_block().append_op(
             type="send",
             inputs={"X": send_inputs},
@@ -167,6 +168,7 @@ def transpile(self,
             attrs={"axis": 0})
 
     def _create_vars_from_blocklist(self, program, block_list):
+        # Create respective variables using the block_list
         block_map = dict()
         var_mapping = dict()
         for block_str in block_list:
@@ -207,11 +209,12 @@ def _clone_var(self, block, var):
             dtype=var.dtype,
             type=var.type,
             lod_level=var.lod_level,
-            # HACK: let all param in pserver persistable so child
+            # HACK: let all param in pserver be persistable so the child
             # program in recv can get them
             persistable=True)
 
     def _append_split_op(self, program, gradblocks):
+        # Split variables that need to be split and append respective ops
         var_mapping = self._create_vars_from_blocklist(program, gradblocks)
         for varname, splited_vars in var_mapping.iteritems():
             # variable that don't need to split have empty splited_vars
@@ -248,6 +251,7 @@ def get_trainer_program(self):
         return self.program
 
     def _create_var_for_trainers(self, block, var, trainers):
+        # For each trainer, create the necessary variables
        var_list = []
        for i in xrange(trainers):
            var_each = block.create_var(
@@ -262,7 +266,7 @@ def _get_optimizer_input_shape(self, op_type, varkey, orig_shape,
                                    param_shape):
         """
             Returns the shape for optimizer inputs that need to be reshaped when
-            Param and Grad is splited to multiple servers.
+            Param and Grad is split to multiple servers.
         """
         # HACK(typhoonzero): Should use functions of corresponding optimizer in
         # optimizer.py to get the shape, do not bind this in the transpiler.
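
The intent described by this docstring and the HACK note is that optimizer state which is elementwise with the parameter must shrink to the split block's shape, while scalar-like inputs keep their original shape. The sketch below only illustrates that idea; the input keys ("Moment", "Moment1", "Moment2", "Velocity", "LearningRate") are assumptions drawn from common optimizer ops, not from this diff.

    def illustrative_optimizer_input_shape(varkey, orig_shape, param_shape):
        # Elementwise optimizer state (assumed keys) follows the split Param block.
        if varkey in ("Moment", "Moment1", "Moment2", "Velocity"):
            return param_shape
        # Scalar-like inputs such as LearningRate keep their original shape.
        return orig_shape
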
@@ -300,7 +304,7 @@ def _is_op_on_pserver(self, endpoint, all_ops, idx):
         else:
             for n in param_names:
                 if n.startswith(op.inputs["Param"].name+".block") and \
-                    n != op.inputs["Param"].name:
+                        n != op.inputs["Param"].name:
                     return True
             return False
     else:
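
The startswith check in this hunk relies on the naming convention used for split blocks elsewhere in this file, where a parameter named w appears on pservers as w.block0, w.block1, and so on. A small hedged illustration of that check in isolation (the helper name is made up):

    def is_block_of(param_name, var_name):
        # True if var_name is a split block of param_name, e.g. "w.block1" for "w",
        # but not the parameter itself.
        return (var_name.startswith(param_name + ".block")
                and var_name != param_name)

    # is_block_of("fc_0.w_0", "fc_0.w_0.block2") -> True
    # is_block_of("fc_0.w_0", "fc_0.w_0")        -> False
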
@@ -396,7 +400,7 @@ def _append_pserver_ops(self, program, pserver_program, opt_op, endpoint):
                 dtype=var.dtype,
                 shape=new_shape)
 
-        # change outputs ParamOut variable
+        # change output's ParamOut variable
         opt_op.outputs["ParamOut"] = new_inputs["Param"]
         program.global_block().append_op(
             type=opt_op.type,
@@ -405,6 +409,7 @@ def _append_pserver_ops(self, program, pserver_program, opt_op, endpoint):
             attrs=opt_op.attrs)
 
     def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op):
+        # Append the ops for parameters that do not need to be optimized/updated
         for _, var in opt_op.inputs.iteritems():
             program.global_block().create_var(
                 name=var.name,
@@ -424,7 +429,7 @@ def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op):
 
     def get_pserver_program(self, endpoint):
         """
-        get pserver side program by endpoint
+        Get pserver side program using the endpoint
 
         NOTE: assume blocks of the same variable is not distributed
         on the same pserver, only change param/grad varnames for
@@ -450,6 +455,7 @@ def get_pserver_program(self, endpoint):
                 shape=v.shape)
         # step6
         optimize_sub_program = Program()
+        # Iterate through the ops and append ops as needed
         for idx, opt_op in enumerate(self.optimize_ops):
             is_op_on_pserver = self._is_op_on_pserver(endpoint,
                                                       self.optimize_ops, idx)
@@ -461,6 +467,7 @@ def get_pserver_program(self, endpoint):
             else:
                 self._append_pserver_non_opt_ops(optimize_sub_program,
                                                  pserver_program, opt_op)
+        # Append the recv op
         pserver_program.global_block().append_op(
             type="recv",
             inputs={"RX": self.param_grad_ep_mapping[endpoint]["grads"]
@@ -486,7 +493,7 @@ def get_startup_program(self, endpoint, pserver_program):
         """
         Get startup program for current parameter server.
         Modify operator input variables if there are variables that
-        was splited to several blocks.
+        were split to several blocks.
         """
         s_prog = Program()
         orig_s_prog = framework.default_startup_program()

0 comments