Skip to content

Commit fca1c52

Browse files
Engine clean (#92)
* add cluster running * add k8s/paddle cloud * fix local cluster without gpu * define engine to train/infer/local_cluster_train/cluster_train * fix gpu cluster wait * fix bug for 2ps 2tr Co-authored-by: tangwei <[email protected]>
1 parent 8270dd0 commit fca1c52

File tree

46 files changed

+745
-139
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+745
-139
lines changed

core/engine/cluster/cluster.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,13 @@ def __init_impl__(self):
2929
abs_dir = os.path.dirname(os.path.abspath(__file__))
3030

3131
backend = envs.get_runtime_environ("engine_backend")
32-
if backend == "PaddleCloud":
32+
if not backend:
33+
backend = ""
34+
backend = backend.upper()
35+
if backend == "PADDLECLOUD":
3336
self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
37+
elif backend == "KUBERNETES":
38+
self.submit_script = os.path.join(abs_dir, "k8s/cluster.sh")
3439
else:
3540
raise ValueError("{} can not be supported now".format(backend))
3641

@@ -48,6 +53,14 @@ def start_master_procs(self):
4853
proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd())
4954
proc.wait()
5055

56+
@staticmethod
57+
def workspace_replace():
58+
workspace = envs.get_runtime_environ("engine_workspace")
59+
60+
for k, v in os.environ.items():
61+
v = v.replace("{workspace}", workspace)
62+
os.environ[k] = str(v)
63+
5164
def run(self):
5265
role = envs.get_runtime_environ("engine_role")
5366

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

core/engine/cluster/k8s/cluster.sh

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#!/bin/bash
2+
3+
###################################################
4+
# Usage: submit.sh
5+
# Description: run k8s submit client implement
6+
###################################################
7+
8+
# ---------------------------------------------------------------------------- #
9+
# variable define #
10+
# ---------------------------------------------------------------------------- #
11+
12+
#-----------------------------------------------------------------------------------------------------------------
13+
#fun : create ConfigMap for k8s pod
14+
#param : N/A
15+
#return : 0 -- success; not 0 -- failure
16+
#-----------------------------------------------------------------------------------------------------------------
17+
function create_config_map() {
18+
echo "Create configmap"
19+
echo "Delete exist configmap which named 'modelconfig'"
20+
kubectl delete configmap modelconfig
21+
kubectl create configmap modelconfig --from-file=${abs_dir}/k8s/set_k8s_env.sh,${paddlerec_model_config}
22+
}
23+
24+
#-----------------------------------------------------------------------------------------------------------------
25+
#fun : create yaml config for k8s job
26+
#param : N/A
27+
#return : 0 -- success; not 0 -- failure
28+
#-----------------------------------------------------------------------------------------------------------------
29+
function create_k8s_yaml() {
30+
echo "Create k8s.yaml"
31+
if [ -f ${PWD}/k8s.yaml ]; then
32+
echo "Delete exist k8s.yaml at ${PWD}"
33+
rm ${PWD}/k8s.yaml
34+
fi
35+
36+
let total_pod_num=${engine_submit_trainer_num}+${engine_submit_server_num}
37+
echo "--K8S ENV-- Job name: ${engine_job_name}"
38+
echo "--K8S ENV-- Total pod nums: ${total_pod_num}"
39+
echo "--K8S ENV-- Trainer nums: ${engine_submit_trainer_num}"
40+
echo "--K8S ENV-- Pserver nums: ${engine_submit_server_num}"
41+
echo "--K8S ENV-- Docker image: ${engine_submit_docker_image}"
42+
echo "--K8S ENV-- Threads(cpu_num) ${CPU_NUM}"
43+
echo "--K8S ENV-- Memory ${engine_submit_memory}"
44+
echo "--K8S ENV-- Storage ${engine_submit_storage}"
45+
echo "--K8S ENV-- Log level ${engine_submit_log_level}"
46+
47+
sed -e "s#<$ JOB_NAME $>#$engine_job_name#g" \
48+
-e "s#<$ TOTAL_POD_NUM $>#$total_pod_num#g" \
49+
-e "s#<$ TRAINER_NUM $>#$engine_submit_trainer_num#g" \
50+
-e "s#<$ PSERVER_NUM $>#$engine_submit_server_num#g" \
51+
-e "s#<$ IMAGE $>#$engine_submit_docker_image#g" \
52+
-e "s#<$ CPU_NUM $>#$CPU_NUM#g" \
53+
-e "s#<$ MEMORY $>#$engine_submit_memory#g" \
54+
-e "s#<$ STORAGE $>#$engine_submit_storage#g" \
55+
-e "s#<$ GLOG_V $>#$engine_submit_log_level#g" \
56+
${abs_dir}/k8s/k8s.yaml.template >${PWD}/k8s.yaml
57+
}
58+
59+
#-----------------------------------------------------------------------------------------------------------------
60+
#fun : submit to k8s cluster
61+
#param : N/A
62+
#return : 0 -- success; not 0 -- failure
63+
#-----------------------------------------------------------------------------------------------------------------
64+
function submit() {
65+
echo "Submit"
66+
echo "Delete exist job which named ${engine_job_name}"
67+
kubectl delete jobs.batch.volcano.sh $engine_job_name
68+
kubectl apply -f ${PWD}/k8s.yaml
69+
}
70+
71+
function main() {
72+
create_config_map
73+
create_k8s_yaml
74+
submit
75+
}
76+
77+
main
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
apiVersion: batch.volcano.sh/v1alpha1
2+
kind: Job
3+
metadata:
4+
name: <$ JOB_NAME $>
5+
spec:
6+
minAvailable: <$ TOTAL_POD_NUM $>
7+
schedulerName: volcano
8+
policies:
9+
- event: PodEvicted
10+
action: RestartJob
11+
- event: PodFailed
12+
action: RestartJob
13+
tasks:
14+
- replicas: <$ PSERVER_NUM $>
15+
name: pserver
16+
template:
17+
metadata:
18+
labels:
19+
paddle-job-pserver: paddle-rec
20+
spec:
21+
imagePullSecrets:
22+
- name: default-secret
23+
containers:
24+
- image: <$ IMAGE $>
25+
command:
26+
- '/bin/bash'
27+
args:
28+
- "-c"
29+
- |
30+
set -ex
31+
sh /usr/paddlerec/set_k8s_env.sh start_fluid
32+
imagePullPolicy: Always
33+
volumeMounts:
34+
- name: model-config
35+
mountPath: "/usr/paddlerec"
36+
name: pserver
37+
resources:
38+
limits:
39+
cpu: <$ CPU_NUM $>
40+
memory: <$ MEMORY $>
41+
ephemeral-storage: <$ STORAGE $>
42+
requests:
43+
cpu: 1
44+
memory: 1Gi
45+
ephemeral-storage: 1Gi
46+
env:
47+
- name: GLOG_v
48+
value: "<$ GLOG_V $>"
49+
- name: GLOG_logtostderr
50+
value: "1"
51+
- name: TOPOLOGY
52+
value: ""
53+
- name: TRAINER_PACKAGE
54+
value: /root/paddlejob
55+
- name: PADDLE_INIT_NICS
56+
value: eth2
57+
- name: NAMESPACE
58+
valueFrom:
59+
fieldRef:
60+
apiVersion: v1
61+
fieldPath: metadata.namespace
62+
- name: POD_IP
63+
valueFrom:
64+
fieldRef:
65+
apiVersion: v1
66+
fieldPath: status.podIP
67+
- name: POD_NAME
68+
valueFrom:
69+
fieldRef:
70+
apiVersion: v1
71+
fieldPath: metadata.name
72+
- name: PADDLE_CURRENT_IP
73+
valueFrom:
74+
fieldRef:
75+
apiVersion: v1
76+
fieldPath: status.podIP
77+
- name: PADDLE_JOB_NAME
78+
value: paddle-rec
79+
- name: PADDLE_IS_LOCAL
80+
value: "0"
81+
- name: PADDLE_TRAINERS_NUM
82+
value: "<$ TRAINER_NUM $>"
83+
- name: PADDLE_PSERVERS_NUM
84+
value: "<$ PSERVER_NUM $>"
85+
- name: FLAGS_rpc_deadline
86+
value: "100000"
87+
- name: ENTRY
88+
value: python -m paddlerec.run -m /usr/paddlerec/config.yaml -r WORKER
89+
- name: PADDLE_PORT
90+
value: "30240"
91+
- name: PADDLE_TRAINING_ROLE
92+
value: PSERVER
93+
- name: TRAINING_ROLE
94+
value: PSERVER
95+
volumes:
96+
- name: model-config
97+
configMap:
98+
name: modelconfig
99+
defaultMode: 0777
100+
restartPolicy: OnFailure
101+
102+
- replicas: <$ TRAINER_NUM $>
103+
policies:
104+
- event: TaskCompleted
105+
action: CompleteJob
106+
name: trainer
107+
template:
108+
metadata:
109+
labels:
110+
paddle-job: paddle-rec
111+
spec:
112+
imagePullSecrets:
113+
- name: default-secret
114+
containers:
115+
- image: <$ IMAGE $>
116+
command:
117+
- '/bin/bash'
118+
args:
119+
- "-c"
120+
- |
121+
set -ex
122+
sh /usr/paddlerec/set_k8s_env.sh start_fluid
123+
imagePullPolicy: Always
124+
volumeMounts:
125+
- name: model-config
126+
mountPath: "/usr/paddlerec"
127+
name: trainer
128+
resources:
129+
limits:
130+
cpu: <$ CPU_NUM $>
131+
memory: <$ MEMORY $>
132+
ephemeral-storage: <$ STORAGE $>
133+
requests:
134+
cpu: 1
135+
memory: 1Gi
136+
ephemeral-storage: 1Gi
137+
env:
138+
- name: GLOG_v
139+
value: "<$ GLOG_V $>"
140+
- name: GLOG_logtostderr
141+
value: "1"
142+
- name: TRAINER_PACKAGE
143+
value: /root/paddlejob
144+
- name: PADDLE_INIT_NICS
145+
value: eth2
146+
- name: NAMESPACE
147+
valueFrom:
148+
fieldRef:
149+
apiVersion: v1
150+
fieldPath: metadata.namespace
151+
- name: POD_IP
152+
valueFrom:
153+
fieldRef:
154+
apiVersion: v1
155+
fieldPath: status.podIP
156+
- name: POD_NAME
157+
valueFrom:
158+
fieldRef:
159+
apiVersion: v1
160+
fieldPath: metadata.name
161+
- name: PADDLE_CURRENT_IP
162+
valueFrom:
163+
fieldRef:
164+
apiVersion: v1
165+
fieldPath: status.podIP
166+
- name: PADDLE_JOB_NAME
167+
value: paddle-rec
168+
- name: PADDLE_IS_LOCAL
169+
value: "0"
170+
- name: FLAGS_rpc_deadline
171+
value: "3600"
172+
- name: PADDLE_PORT
173+
value: "30240"
174+
- name: PADDLE_PSERVERS_NUM
175+
value: "<$ PSERVER_NUM $>"
176+
- name: PADDLE_TRAINERS_NUM
177+
value: "<$ TRAINER_NUM $>"
178+
- name: PADDLE_TRAINING_ROLE
179+
value: TRAINER
180+
- name: TRAINING_ROLE
181+
value: TRAINER
182+
- name: ENTRY
183+
value: python -m paddlerec.run -m /usr/paddlerec/config.yaml -r WORKER
184+
volumes:
185+
- name: model-config
186+
configMap:
187+
name: modelconfig
188+
defaultMode: 0777
189+
restartPolicy: OnFailure
190+
191+
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/bin/bash
2+
set -x
3+
4+
check_failed_cnt() {
5+
max_failed=$1
6+
failed_count=$(python -m paddlerec.tools.k8s_tools count_pods_by_phase paddle-job=${PADDLE_JOB_NAME} Failed)
7+
if [ $failed_count -gt $max_failed ]; then
8+
stdbuf -oL echo "Failed trainer count beyond the threadhold: "$max_failed
9+
echo "Failed trainer count beyond the threshold: " $max_failed >/dev/termination-log
10+
exit 0
11+
fi
12+
}
13+
14+
check_trainer_ret() {
15+
ret=$1
16+
stdbuf -oL echo "job returned $ret...setting pod return message..."
17+
stdbuf -oL echo "==============================="
18+
19+
if [ $ret -eq 136 ]; then
20+
echo "Error Arithmetic Operation(Floating Point Exception)" >/dev/termination-log
21+
elif [ $ret -eq 139 ]; then
22+
echo "Segmentation Fault" >/dev/termination-log
23+
elif [ $ret -eq 1 ]; then
24+
echo "General Error" >/dev/termination-log
25+
elif [ $ret -eq 134 ]; then
26+
echo "Program Abort" >/dev/termination-log
27+
fi
28+
stdbuf -oL echo "termination log wroted..."
29+
exit $ret
30+
}
31+
32+
start_fluid_process() {
33+
pserver_label="paddle-job-pserver=${PADDLE_JOB_NAME}"
34+
trainer_label="paddle-job=${PADDLE_JOB_NAME}"
35+
hostname=${HOSTNAME}
36+
task_index=""
37+
38+
if [ "${PADDLE_TRAINING_ROLE}" == "TRAINER" ] || [ "${PADDLE_TRAINING_ROLE}" == "PSERVER" ]; then
39+
stdbuf -oL python -m paddlerec.tools.k8s_tools wait_pods_running ${pserver_label} ${PADDLE_PSERVERS_NUM}
40+
fi
41+
42+
export PADDLE_PSERVERS_IP_PORT_LIST=$(python -m paddlerec.tools.k8s_tools fetch_endpoints ${pserver_label} ${PADDLE_PORT})
43+
export PADDLE_TRAINER_IPS=$(python -m paddlerec.tools.k8s_tools fetch_ips ${trainer_label})
44+
45+
if [ "${PADDLE_TRAINING_ROLE}" == "TRAINER" ]; then
46+
check_failed_cnt 1
47+
task_index=$(python -m paddlerec.tools.k8s_tools fetch_id ${trainer_label})
48+
else
49+
task_index=$(python -m paddlerec.tools.k8s_tools fetch_id ${pserver_label})
50+
fi
51+
52+
export PADDLE_TRAINER_ID=${task_index}
53+
export PADDLE_PSERVER_ID=${task_index}
54+
55+
stdbuf -oL sh -c "${ENTRY}"
56+
check_trainer_ret $?
57+
}
58+
59+
usage() {
60+
echo "usage: paddle_k8s [<args>]:"
61+
echo " start_fluid Start paddle fluid distributed training, set env"
62+
}
63+
64+
case "$1" in
65+
start_fluid)
66+
start_fluid_process
67+
;;
68+
--help)
69+
usage
70+
;;
71+
*)
72+
usage
73+
;;
74+
esac

0 commit comments

Comments
 (0)