Skip to content

Commit aa7da07

Browse files
Add the implementation of the train | evaluate | predict commands in elasticdl_client. (#2089)
* Add the version for the docker python package. * Add the method declaration for the argument parser. * Add the arguments for train|predict|evaluate. * Copy common modules from the elasticdl main package to the elasticdl_client package. * Call _submit_job in the train API. * Add the k8s_resource module. * Set the default value for the extra_pypi_index argument. * Update the default base image value to python:3.7. * Update the argument description. * Make the --image argument required. * Add the implementation of evaluate and predict. * Update the default base image to python:3.6. * Remove the clean command. * Add comments. * Rename model_utils to module_utils. * Simplify the function in k8s_client in the elasticdl_client package. * Remove the unnecessary event_cb in k8s_client in the elasticdl_client package.
1 parent e948eec commit aa7da07

File tree

10 files changed

+1280
-16
lines changed

10 files changed

+1280
-16
lines changed

elasticdl/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
grpcio-tools==1.29.0
22
kubernetes==10.1.0
3-
docker
3+
docker==4.2.1
44
pyrecordio>=0.0.6
55
Cython
66
odps

elasticdl_client/api.py

Lines changed: 153 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,18 @@
1717
import docker
1818
from jinja2 import Template
1919

20+
from elasticdl_client.common import k8s_client as k8s
21+
from elasticdl_client.common.args import (
22+
build_arguments_from_parsed_result,
23+
parse_envs,
24+
wrap_python_args_with_string,
25+
)
26+
from elasticdl_client.common.constants import BashCommandTemplate
27+
from elasticdl_client.common.log_utils import default_logger as logger
28+
2029

2130
def init_zoo(args):
22-
print("Create the Dockerfile for the model zoo.")
31+
logger.info("Create the Dockerfile for the model zoo.")
2332

2433
# Copy cluster spec file to the current directory if specified
2534
cluster_spec_path = args.cluster_spec
@@ -37,16 +46,15 @@ def init_zoo(args):
3746
tmpl_str = """\
3847
FROM {{ BASE_IMAGE }} as base
3948
40-
RUN pip install elasticdl_preprocessing
41-
RUN pip install elasticdl
49+
RUN pip install elasticdl_preprocessing\
50+
--extra-index-url={{ EXTRA_PYPI_INDEX }}
51+
52+
RUN pip install elasticdl --extra-index-url={{ EXTRA_PYPI_INDEX }}
53+
ENV PATH /usr/local/lib/python3.6/dist-packages/elasticdl/go/bin:$PATH
4254
4355
COPY . /model_zoo
44-
{% if EXTRA_PYPI_INDEX %}
45-
RUN pip install -r /model_zoo/requirements.txt\
46-
--extra-index-url={{ EXTRA_PYPI_INDEX }}\
47-
{% else %}\
4856
RUN pip install -r /model_zoo/requirements.txt\
49-
{% endif %}
57+
--extra-index-url={{ EXTRA_PYPI_INDEX }}
5058
5159
{% if CLUSTER_SPEC_NAME %}\
5260
COPY ./{{ CLUSTER_SPEC_NAME }} /cluster_spec/{{ CLUSTER_SPEC_NAME }}\
@@ -59,12 +67,12 @@ def init_zoo(args):
5967
CLUSTER_SPEC_NAME=cluster_spec_name,
6068
)
6169

62-
with open("./Dockerfile", mode="w+") as f:
70+
with open("./Dockerfile", mode="w") as f:
6371
f.write(docker_file_content)
6472

6573

6674
def build_zoo(args):
67-
print("Build the image for the model zoo.")
75+
logger.info("Build the image for the model zoo.")
6876
# Call docker api to build the image
6977
# Validate the image name schema
7078
client = _get_docker_client(
@@ -83,7 +91,7 @@ def build_zoo(args):
8391

8492

8593
def push_zoo(args):
86-
print("Push the image for the model zoo.")
94+
logger.info("Push the image for the model zoo.")
8795
# Call docker api to push the image to remote registry
8896
client = _get_docker_client(
8997
docker_base_url=args.docker_base_url,
@@ -95,6 +103,140 @@ def push_zoo(args):
95103
_print_docker_progress(line)
96104

97105

106+
def train(args):
    """Submit an ElasticDL training job described by the parsed CLI arguments.

    The worker image, model zoo and cluster spec are passed explicitly at
    the front of the master's argument list; the remaining parsed arguments
    are appended via ``build_arguments_from_parsed_result``.

    Args:
        args: Parsed command-line namespace. This function reads ``image``,
            ``model_zoo`` and ``cluster_spec``; ``_submit_job`` reads
            further attributes from the same object.
    """
    # Names that must not be forwarded by the generic argument builder:
    # the first three are already passed explicitly below.
    # NOTE(review): "force_use_kube_config_file" and "func" look like
    # client-side-only settings -- confirm against the argument parser.
    excluded_names = [
        "model_zoo",
        "cluster_spec",
        "worker_image",
        "force_use_kube_config_file",
        "func",
    ]

    master_args = [
        "--worker_image",
        args.image,
        "--model_zoo",
        args.model_zoo,
        "--cluster_spec",
        args.cluster_spec,
    ]
    forwarded_args = build_arguments_from_parsed_result(
        args, filter_args=excluded_names
    )
    master_args.extend(forwarded_args)

    _submit_job(args.image, args, master_args)
130+
131+
132+
def evaluate(args):
    """Submit an ElasticDL evaluation job described by the parsed CLI arguments.

    The master's argument list starts with the worker image, model zoo and
    cluster spec, followed by whatever ``build_arguments_from_parsed_result``
    produces for the remaining parsed arguments.

    Args:
        args: Parsed command-line namespace. This function reads ``image``,
            ``model_zoo`` and ``cluster_spec``; ``_submit_job`` reads
            further attributes from the same object.
    """
    explicit_args = [
        "--worker_image",
        args.image,
        "--model_zoo",
        args.model_zoo,
        "--cluster_spec",
        args.cluster_spec,
    ]

    # Skip the names already passed explicitly above.
    # NOTE(review): "force_use_kube_config_file" and "func" are presumably
    # client-side-only values -- confirm against the argument parser.
    skipped_names = [
        "model_zoo",
        "cluster_spec",
        "worker_image",
        "force_use_kube_config_file",
        "func",
    ]
    remaining_args = build_arguments_from_parsed_result(
        args, filter_args=skipped_names
    )

    explicit_args.extend(remaining_args)
    _submit_job(args.image, args, explicit_args)
155+
156+
157+
def predict(args):
    """Submit an ElasticDL prediction job described by the parsed CLI arguments.

    The master's argument list starts with the worker image, model zoo and
    cluster spec, followed by whatever ``build_arguments_from_parsed_result``
    produces for the remaining parsed arguments.

    Args:
        args: Parsed command-line namespace. This function reads ``image``,
            ``model_zoo`` and ``cluster_spec``; ``_submit_job`` reads
            further attributes from the same object.
    """
    container_args = [
        "--worker_image",
        args.image,
        "--model_zoo",
        args.model_zoo,
        "--cluster_spec",
        args.cluster_spec,
    ]

    container_args.extend(
        build_arguments_from_parsed_result(
            args,
            filter_args=[
                "model_zoo",
                "cluster_spec",
                "worker_image",
                "force_use_kube_config_file",
                # "func" is excluded here exactly as in train() and
                # evaluate(); it was previously missing from this list.
                # NOTE(review): presumably the argparse sub-command
                # dispatch callable, which must not be forwarded to the
                # master command line -- confirm against the parser.
                "func",
            ],
        )
    )

    _submit_job(args.image, args, container_args)
180+
181+
182+
def _submit_job(image_name, client_args, container_args):
    """Create the master pod for a job, or dump its YAML when requested.

    The container arguments are wrapped, prefixed with the command that
    starts ``elasticdl.python.master.main``, optionally followed by an
    output redirection, and joined into one string passed as ``-c <cmd>``.

    Args:
        image_name: Image name handed to ``k8s.Client``.
        client_args: Parsed command-line namespace; supplies the namespace,
            job name, cluster spec, master resources, pod priority, pull
            and restart policies, volume, envs, ``log_file_path`` and
            ``yaml``.
        container_args: Argument list for the master entry point.
    """
    k8s_api = k8s.Client(
        image_name=image_name,
        namespace=client_args.namespace,
        job_name=client_args.job_name,
        cluster_spec=client_args.cluster_spec,
        force_use_kube_config_file=client_args.force_use_kube_config_file,
    )

    command_parts = wrap_python_args_with_string(container_args)

    # The master entry point goes first so the wrapped arguments follow it.
    launcher = (
        BashCommandTemplate.SET_PIPEFAIL
        + " python -m elasticdl.python.master.main"
    )
    command_parts.insert(0, launcher)

    log_path = client_args.log_file_path
    if log_path:
        command_parts.append(BashCommandTemplate.REDIRECTION.format(log_path))

    # Everything is collapsed into a single "-c <command>" argument pair.
    shell_args = ["-c", " ".join(command_parts)]

    yaml_path = client_args.yaml

    # Settings shared by both the "dump YAML" and the "create pod" paths.
    master_settings = dict(
        resource_requests=client_args.master_resource_request,
        resource_limits=client_args.master_resource_limit,
        args=shell_args,
        pod_priority=client_args.master_pod_priority,
        image_pull_policy=client_args.image_pull_policy,
        restart_policy=client_args.restart_policy,
        volume=client_args.volume,
        envs=parse_envs(client_args.envs),
    )

    if yaml_path:
        # Only write the master specification to a file; nothing is created.
        k8s_api.dump_master_yaml(yaml=client_args.yaml, **master_settings)
        logger.info(
            "ElasticDL job %s YAML has been dumped into file %s."
            % (client_args.job_name, client_args.yaml)
        )
        return

    k8s_api.create_master(**master_settings)
    logger.info(
        "ElasticDL job %s was successfully submitted. "
        "The master pod is: %s."
        % (client_args.job_name, k8s_api.get_master_pod_name())
    )
238+
239+
98240
def _get_docker_client(docker_base_url, docker_tlscert, docker_tlskey):
99241
if docker_tlscert and docker_tlskey:
100242
tls_config = docker.tls.TLSConfig(

0 commit comments

Comments
 (0)