Skip to content

Commit 25241e9

Browse files
authored
Fix paddle env variables. (#11564)
1 parent 4116b55 commit 25241e9

12 files changed

+56
-50
lines changed

benchmark/fluid/fluid_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
9797
return train_program, fluid.default_startup_program()
9898
else:
9999
raise ValueError(
100-
'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
100+
'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
101101
)
102102

103103

benchmark/fluid/kube_gen_job.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,10 @@ def gen_job():
108108
tn_container["ports"][0]["containerPort"] = spreadport
109109

110110
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
111-
envs.append({"name": "TRAINERS", "value": str(args.trainers)})
111+
envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
112112
envs.append({"name": "PSERVERS", "value": str(args.pservers)})
113113
envs.append({"name": "ENTRY", "value": args.entry})
114-
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
114+
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
115115
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
116116
# NOTE: these directories below are cluster specific, please modify
117117
# this settings before you run on your own cluster.
@@ -167,16 +167,22 @@ def gen_job():
167167
tn_container["volumeMounts"] = volumeMounts
168168

169169
ps_container["env"] = envs
170-
ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
170+
ps_container["env"].append({
171+
"name": "PADDLE_TRAINING_ROLE",
172+
"value": "PSERVER"
173+
})
171174
tn_container["env"] = envs
172175
if args.disttype == "pserver":
173176
tn_container["env"].append({
174-
"name": "TRAINING_ROLE",
177+
"name": "PADDLE_TRAINING_ROLE",
175178
"value": "TRAINER"
176179
})
177180
elif args.disttype == "nccl2" or args.disttype == "local":
178181
# NCCL2 have no training role, set to plain WORKER
179-
tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})
182+
tn_container["env"].append({
183+
"name": "PADDLE_TRAINING_ROLE",
184+
"value": "WORKER"
185+
})
180186

181187
os.mkdir(args.jobname)
182188
if args.disttype == "pserver":

doc/fluid/howto/cluster/fluid_cluster_train_cn.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
168168

169169
第二步,启动Parameter Server:
170170
```bash
171-
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
171+
PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
172172
```
173173
执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Parameter Server已经正常启动。
174174

175175
第三步,启动Trainer:
176176
```bash
177-
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
177+
PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
178178
```
179179
由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
180180

doc/fluid/howto/cluster/fluid_recordio.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
114114
ret_list.append(f)
115115
return ret_list
116116

117-
trainers = int(os.getenv("TRAINERS"))
118-
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
117+
trainers = int(os.getenv("PADDLE_TRAINERS"))
118+
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
119119
data_file = fluid.layers.io.open_files(
120120
filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
121121
thread_num=1,

python/paddle/fluid/tests/book/notest_understand_sentiment.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -194,16 +194,16 @@ def train_loop(main_program):
194194
if is_local:
195195
train_loop(fluid.default_main_program())
196196
else:
197-
port = os.getenv("PADDLE_INIT_PORT", "6174")
198-
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip...
197+
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
198+
pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
199199
eplist = []
200200
for ip in pserver_ips.split(","):
201201
eplist.append(':'.join([ip, port]))
202202
pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
203-
trainers = int(os.getenv("TRAINERS"))
203+
trainers = int(os.getenv("PADDLE_TRAINERS"))
204204
current_endpoint = os.getenv("POD_IP") + ":" + port
205-
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
206-
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
205+
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
206+
training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
207207
t = fluid.DistributeTranspiler()
208208
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
209209
if training_role == "PSERVER":

python/paddle/fluid/tests/book/test_fit_a_line.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,16 +69,16 @@ def train_loop(main_program):
6969
if is_local:
7070
train_loop(fluid.default_main_program())
7171
else:
72-
port = os.getenv("PADDLE_INIT_PORT", "6174")
73-
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip...
72+
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
73+
pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
7474
eplist = []
7575
for ip in pserver_ips.split(","):
7676
eplist.append(':'.join([ip, port]))
7777
pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
78-
trainers = int(os.getenv("TRAINERS"))
78+
trainers = int(os.getenv("PADDLE_TRAINERS"))
7979
current_endpoint = os.getenv("POD_IP") + ":" + port
80-
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
81-
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
80+
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
81+
training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
8282
t = fluid.DistributeTranspiler()
8383
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
8484
if training_role == "PSERVER":

python/paddle/fluid/tests/book/test_image_classification.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -178,16 +178,16 @@ def train_loop(main_program):
178178
if is_local:
179179
train_loop(fluid.default_main_program())
180180
else:
181-
port = os.getenv("PADDLE_INIT_PORT", "6174")
182-
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip...
181+
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
182+
pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
183183
eplist = []
184184
for ip in pserver_ips.split(","):
185185
eplist.append(':'.join([ip, port]))
186186
pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
187-
trainers = int(os.getenv("TRAINERS"))
187+
trainers = int(os.getenv("PADDLE_TRAINERS"))
188188
current_endpoint = os.getenv("POD_IP") + ":" + port
189-
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
190-
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
189+
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
190+
training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
191191
t = fluid.DistributeTranspiler()
192192
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
193193
if training_role == "PSERVER":

python/paddle/fluid/tests/book/test_label_semantic_roles.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -209,16 +209,16 @@ def train_loop(main_program):
209209
if is_local:
210210
train_loop(fluid.default_main_program())
211211
else:
212-
port = os.getenv("PADDLE_INIT_PORT", "6174")
213-
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip...
212+
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
213+
pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
214214
eplist = []
215215
for ip in pserver_ips.split(","):
216216
eplist.append(':'.join([ip, port]))
217217
pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
218-
trainers = int(os.getenv("TRAINERS"))
218+
trainers = int(os.getenv("PADDLE_TRAINERS"))
219219
current_endpoint = os.getenv("POD_IP") + ":" + port
220-
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
221-
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
220+
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
221+
training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
222222
t = fluid.DistributeTranspiler()
223223
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
224224
if training_role == "PSERVER":

python/paddle/fluid/tests/book/test_machine_translation.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -200,16 +200,16 @@ def train_loop(main_program):
200200
if is_local:
201201
train_loop(framework.default_main_program())
202202
else:
203-
port = os.getenv("PADDLE_INIT_PORT", "6174")
204-
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip...
203+
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
204+
pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
205205
eplist = []
206206
for ip in pserver_ips.split(","):
207207
eplist.append(':'.join([ip, port]))
208208
pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
209-
trainers = int(os.getenv("TRAINERS"))
209+
trainers = int(os.getenv("PADDLE_TRAINERS"))
210210
current_endpoint = os.getenv("POD_IP") + ":" + port
211-
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
212-
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
211+
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
212+
training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
213213
t = fluid.DistributeTranspiler()
214214
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
215215
if training_role == "PSERVER":

python/paddle/fluid/tests/book/test_recognize_digits.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -151,16 +151,16 @@ def train_loop(main_program):
151151
if is_local:
152152
train_loop(fluid.default_main_program())
153153
else:
154-
port = os.getenv("PADDLE_INIT_PORT", "6174")
155-
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip...
154+
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
155+
pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
156156
eplist = []
157157
for ip in pserver_ips.split(","):
158158
eplist.append(':'.join([ip, port]))
159159
pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
160-
trainers = int(os.getenv("TRAINERS"))
160+
trainers = int(os.getenv("PADDLE_TRAINERS"))
161161
current_endpoint = os.getenv("POD_IP") + ":" + port
162-
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
163-
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
162+
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
163+
training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
164164
t = fluid.DistributeTranspiler()
165165
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
166166
if training_role == "PSERVER":

0 commit comments

Comments
 (0)