configure separate GPU support for management and common workflow ksvcs

ksatzke · ksatzke · commit 9fba58372564 · 2020-08-21T11:51:58.000Z
diff --git a/ManagementService/python/deployWorkflow.py b/ManagementService/python/deployWorkflow.py
@@ -301,6 +301,19 @@ def create_k8s_deployment(email, workflow_info, runtime, management=False):
         kservice['spec']['template']['spec']['volumes'] = [{ 'name': 'new-workflow-conf', 'configMap': {'name': new_workflow_conf['configmap']}}]
         kservice['spec']['template']['spec']['containers'][0]['volumeMounts'] = [{'name': 'new-workflow-conf', 'mountPath': '/opt/mfn/SandboxAgent/conf'}]
         kservice['spec']['template']['spec']['serviceAccountName'] = new_workflow_conf['mgmtserviceaccount']
+
+        # management container should not consume a GPU
+        #kservice['spec']['template']['spec']['containers'][0]['image'] = new_workflow_conf['image.Python']
+        #if ("nvidia.com/gpu" in kservice['spec']['template']['spec']['containers'][0]['resources']['limits'].keys()): 
+        # overwrite limits entry, generate new k/v pair
+        #print("RESOURCES: " + str(kservice['spec']['template']['spec']['containers'][0]['resources'])) # just testin...
+        #print("RESOURCES: " + str(kservice['spec']['template']['spec']['containers'][0]['resources']['limits'])) # just testin...
+        if (labels['workflowid'] == "Management"):
+            kservice['spec']['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = "0"
+            kservice['spec']['template']['spec']['containers'][0]['resources']['requests']['nvidia.com/gpu'] = "0"
+        #kservice['spec']['template']['spec']['containers'][0]['resources']['limits'] = {{"cpu": 1, "memory": "2Gi"}, "requests": {"cpu": 1, "memory": "1Gi"}}       
+        #kservice['spec']['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = 0 
+
         if 'HTTP_GATEWAYPORT' in new_workflow_conf:
             env.append({'name': 'HTTP_GATEWAYPORT', 'value': new_workflow_conf['HTTP_GATEWAYPORT']})
         if 'HTTPS_GATEWAYPORT' in new_workflow_conf:
diff --git a/Sandbox/Dockerfile b/Sandbox/Dockerfile
@@ -12,19 +12,14 @@
 #   See the License for the specific language governing permissions and
 #   limitations under the License.
 
-#FROM ubuntu:18.04
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
+FROM ubuntu:18.04
 
 # Install (as root)
 # Base
 RUN apt-get update --fix-missing
 RUN apt-get -y --no-install-recommends install build-essential
 RUN apt-get -y --no-install-recommends install netbase unzip file libmagic1
 
-# CUDA 10.1 dependencies and tools to build dlib 
-RUN apt-get -y --no-install-recommends install libsm6 libxrender1 libxrender-dev libxext6 libglib2.0-0 git cmake
-RUN apt-get install -y --no-install-recommends libnvinfer6=6.0.1-1+cuda10.1 libnvinfer-dev=6.0.1-1+cuda10.1 libnvinfer-plugin6=6.0.1-1+cuda10.1
-
 # Python
 RUN apt-get -y --no-install-recommends install python3 python3-dev
 RUN apt-get -y --no-install-recommends install python3-pip
@@ -43,19 +38,6 @@ RUN /usr/bin/python3 -m pip install fastcache
 # Needed for multi-language support (currently just Java)
 RUN /usr/bin/python3 -m pip install thriftpy2
 
-# Install dlib for CUDA
-RUN git clone https://github.com/davisking/dlib.git
-RUN mkdir -p /dlib/build
-
-RUN cmake -H/dlib -B/dlib/build -DDLIB_USE_CUDA=1 -DUSE_AVX_INSTRUCTIONS=1
-RUN cmake --build /dlib/build
-
-RUN cd /dlib; python3 /dlib/setup.py install
-
-# Install the face recognition package and tensorflow
-RUN pip3 install face_recognition
-RUN pip3 install tensorflow==2.1.0
-
 # Java (for queue service)
 RUN apt-get -y --no-install-recommends install openjdk-8-jdk-headless
 
diff --git a/Sandbox/Dockerfile.cpu b/Sandbox/Dockerfile.cpu
diff --git a/deploy/helm/Makefile b/deploy/helm/Makefile
@@ -41,4 +41,4 @@ push:
 	make -C ../../GUI push
 
 deploy: push
-	helm install --name mfn microfunctions/
+	helm install mfn microfunctions/
diff --git a/deploy/helm/microfunctions/values.yaml b/deploy/helm/microfunctions/values.yaml
@@ -19,7 +19,8 @@
 #------------------------------------------------------------------------------
 # MicroFunction management workflow
 #------------------------------------------------------------------------------
-imageRepo: "registry.kube-system.svc.cluster.local"
+#imageRepo: "registry.kube-system.svc.cluster.local"
+imageRepo: "localhost:5000"
 manager:
   #httpProxy: "http://<host>:<port>"
   #httpsProxy: "http://<host>:<port>"
@@ -64,7 +65,7 @@ manager:
 # MicroFunction Datalayer
 #------------------------------------------------------------------------------
 datalayer:
-  replicas: 3
+  replicas: 1
   imagePath: "/microfn/datalayer"
   imageTag: "latest"
   imagePullPolicy: "Always"
@@ -82,7 +83,7 @@ datalayer:
 # Riak global data storage
 #------------------------------------------------------------------------------
 riak:
-  replicas: 3
+  replicas: 1
   imagePath: "/microfn/riak"
   imageTag: "latest"
   imagePullPolicy: "Always"
diff --git a/tests/mfn_test_utils.py b/tests/mfn_test_utils.py
@@ -326,10 +326,10 @@ def get_test_workflow_endpoints(self):
         if self._workflow.status == "deployed":
             return self._workflow.endpoints
 
-    def execute(self, message, timeout=None, check_duration=False, async=False):
+    def execute(self, message, timeout=None, check_duration=False, async_=False):
         if timeout is None:
             timeout = self._settings["timeout"]
-        if async:
+        if async_:
             return self._workflow.execute_async(message, timeout)
         else:
             return self._workflow.execute(message, timeout, check_duration)
@@ -365,7 +365,7 @@ def exec_only(self, inp):
             if any_failed_tests:
                 self._print_logs(self._workflow.logs())
 
-    def exec_tests(self, testtuplelist, check_just_keys=False, check_duration=False, should_undeploy=True, async=False):
+    def exec_tests(self, testtuplelist, check_just_keys=False, check_duration=False, should_undeploy=True, async_=False):
         any_failed_tests = False
         durations = []
 
@@ -378,7 +378,7 @@ def exec_tests(self, testtuplelist, check_just_keys=False, check_duration=False,
                 if check_duration:
                     rn, t_total = self.execute(json.loads(inp), check_duration=check_duration)
                 else:
-                    rn = self.execute(json.loads(inp), async=async)
+                    rn = self.execute(json.loads(inp), async_=async_)
 
                 if check_duration:
                     durations.append(t_total)
@@ -390,7 +390,7 @@ def exec_tests(self, testtuplelist, check_just_keys=False, check_duration=False,
                 res_to_check = []
 
                 # hold on to the Execution object, so that we can retrieve more results if needed
-                if async:
+                if async_:
                     rn_async = rn
 
                     if not isinstance(res, list):
@@ -404,7 +404,7 @@ def exec_tests(self, testtuplelist, check_just_keys=False, check_duration=False,
                 for cur_res in res_to_check:
                     # before we can compare results, we need to ensure that we get the actual result
                     # if we executed asynchronously, we'll have to wait until we get the result
-                    if async:
+                    if async_:
                         rn = rn_async.get()
 
                     if check_just_keys: