Skip to content

Commit 9fba583

Browse files
author
ksatzke
committed
configure separate GPU support for management and common workflow ksvcs
1 parent cdd0faf commit 9fba583

File tree

6 files changed

+25
-86
lines changed

6 files changed

+25
-86
lines changed

ManagementService/python/deployWorkflow.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,19 @@ def create_k8s_deployment(email, workflow_info, runtime, management=False):
301301
kservice['spec']['template']['spec']['volumes'] = [{ 'name': 'new-workflow-conf', 'configMap': {'name': new_workflow_conf['configmap']}}]
302302
kservice['spec']['template']['spec']['containers'][0]['volumeMounts'] = [{'name': 'new-workflow-conf', 'mountPath': '/opt/mfn/SandboxAgent/conf'}]
303303
kservice['spec']['template']['spec']['serviceAccountName'] = new_workflow_conf['mgmtserviceaccount']
304+
305+
# management container should not consume a GPU
306+
#kservice['spec']['template']['spec']['containers'][0]['image'] = new_workflow_conf['image.Python']
307+
#if ("nvidia.com/gpu" in kservice['spec']['template']['spec']['containers'][0]['resources']['limits'].keys()):
308+
# overwrite limits entry, generate new k/v pair
309+
#print("RESOURCES: " + str(kservice['spec']['template']['spec']['containers'][0]['resources'])) # just testin...
310+
#print("RESOURCES: " + str(kservice['spec']['template']['spec']['containers'][0]['resources']['limits'])) # just testin...
311+
if (labels['workflowid'] == "Management"):
312+
kservice['spec']['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = "0"
313+
kservice['spec']['template']['spec']['containers'][0]['resources']['requests']['nvidia.com/gpu'] = "0"
314+
#kservice['spec']['template']['spec']['containers'][0]['resources']['limits'] = {{"cpu": 1, "memory": "2Gi"}, "requests": {"cpu": 1, "memory": "1Gi"}}
315+
#kservice['spec']['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = 0
316+
304317
if 'HTTP_GATEWAYPORT' in new_workflow_conf:
305318
env.append({'name': 'HTTP_GATEWAYPORT', 'value': new_workflow_conf['HTTP_GATEWAYPORT']})
306319
if 'HTTPS_GATEWAYPORT' in new_workflow_conf:

Sandbox/Dockerfile

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,14 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
#FROM ubuntu:18.04
16-
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
15+
FROM ubuntu:18.04
1716

1817
# Install (as root)
1918
# Base
2019
RUN apt-get update --fix-missing
2120
RUN apt-get -y --no-install-recommends install build-essential
2221
RUN apt-get -y --no-install-recommends install netbase unzip file libmagic1
2322

24-
# CUDA 10.1 dependencies and tools to build dlib
25-
RUN apt-get -y --no-install-recommends install libsm6 libxrender1 libxrender-dev libxext6 libglib2.0-0 git cmake
26-
RUN apt-get install -y --no-install-recommends libnvinfer6=6.0.1-1+cuda10.1 libnvinfer-dev=6.0.1-1+cuda10.1 libnvinfer-plugin6=6.0.1-1+cuda10.1
27-
2823
# Python
2924
RUN apt-get -y --no-install-recommends install python3 python3-dev
3025
RUN apt-get -y --no-install-recommends install python3-pip
@@ -43,19 +38,6 @@ RUN /usr/bin/python3 -m pip install fastcache
4338
# Needed for multi-language support (currently just Java)
4439
RUN /usr/bin/python3 -m pip install thriftpy2
4540

46-
# Install dlib for CUDA
47-
RUN git clone https://github.com/davisking/dlib.git
48-
RUN mkdir -p /dlib/build
49-
50-
RUN cmake -H/dlib -B/dlib/build -DDLIB_USE_CUDA=1 -DUSE_AVX_INSTRUCTIONS=1
51-
RUN cmake --build /dlib/build
52-
53-
RUN cd /dlib; python3 /dlib/setup.py install
54-
55-
# Install the face recognition package and tensorflow
56-
RUN pip3 install face_recognition
57-
RUN pip3 install tensorflow==2.1.0
58-
5941
# Java (for queue service)
6042
RUN apt-get -y --no-install-recommends install openjdk-8-jdk-headless
6143

Sandbox/Dockerfile.cpu

Lines changed: 0 additions & 57 deletions
This file was deleted.

deploy/helm/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,4 @@ push:
4141
make -C ../../GUI push
4242

4343
deploy: push
44-
helm install --name mfn microfunctions/
44+
helm install mfn microfunctions/

deploy/helm/microfunctions/values.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
#------------------------------------------------------------------------------
2020
# MicroFunction management workflow
2121
#------------------------------------------------------------------------------
22-
imageRepo: "registry.kube-system.svc.cluster.local"
22+
#imageRepo: "registry.kube-system.svc.cluster.local"
23+
imageRepo: "localhost:5000"
2324
manager:
2425
#httpProxy: "http://<host>:<port>"
2526
#httpsProxy: "http://<host>:<port>"
@@ -64,7 +65,7 @@ manager:
6465
# MicroFunction Datalayer
6566
#------------------------------------------------------------------------------
6667
datalayer:
67-
replicas: 3
68+
replicas: 1
6869
imagePath: "/microfn/datalayer"
6970
imageTag: "latest"
7071
imagePullPolicy: "Always"
@@ -82,7 +83,7 @@ datalayer:
8283
# Riak global data storage
8384
#------------------------------------------------------------------------------
8485
riak:
85-
replicas: 3
86+
replicas: 1
8687
imagePath: "/microfn/riak"
8788
imageTag: "latest"
8889
imagePullPolicy: "Always"

tests/mfn_test_utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -326,10 +326,10 @@ def get_test_workflow_endpoints(self):
326326
if self._workflow.status == "deployed":
327327
return self._workflow.endpoints
328328

329-
def execute(self, message, timeout=None, check_duration=False, async=False):
329+
def execute(self, message, timeout=None, check_duration=False, async_=False):
330330
if timeout is None:
331331
timeout = self._settings["timeout"]
332-
if async:
332+
if async_:
333333
return self._workflow.execute_async(message, timeout)
334334
else:
335335
return self._workflow.execute(message, timeout, check_duration)
@@ -365,7 +365,7 @@ def exec_only(self, inp):
365365
if any_failed_tests:
366366
self._print_logs(self._workflow.logs())
367367

368-
def exec_tests(self, testtuplelist, check_just_keys=False, check_duration=False, should_undeploy=True, async=False):
368+
def exec_tests(self, testtuplelist, check_just_keys=False, check_duration=False, should_undeploy=True, async_=False):
369369
any_failed_tests = False
370370
durations = []
371371

@@ -378,7 +378,7 @@ def exec_tests(self, testtuplelist, check_just_keys=False, check_duration=False,
378378
if check_duration:
379379
rn, t_total = self.execute(json.loads(inp), check_duration=check_duration)
380380
else:
381-
rn = self.execute(json.loads(inp), async=async)
381+
rn = self.execute(json.loads(inp), async_=async_)
382382

383383
if check_duration:
384384
durations.append(t_total)
@@ -390,7 +390,7 @@ def exec_tests(self, testtuplelist, check_just_keys=False, check_duration=False,
390390
res_to_check = []
391391

392392
# hold on to the Execution object, so that we can retrieve more results if needed
393-
if async:
393+
if async_:
394394
rn_async = rn
395395

396396
if not isinstance(res, list):
@@ -404,7 +404,7 @@ def exec_tests(self, testtuplelist, check_just_keys=False, check_duration=False,
404404
for cur_res in res_to_check:
405405
# before we can compare results, we need to ensure that we get the actual result
406406
# if we executed asynchronously, we'll have to wait until we get the result
407-
if async:
407+
if async_:
408408
rn = rn_async.get()
409409

410410
if check_just_keys:

0 commit comments

Comments
 (0)