This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Commit a841776

Merge pull request #225 from panchul/gpu_misc
Updating container references, fixing docs to more portable links
2 parents: 081478f + d2493d3

File tree

6 files changed: +37 −20 lines


edge_k8s_gpu_sharing/deploy_infer.yaml

Lines changed: 3 additions & 2 deletions
@@ -22,6 +22,7 @@ spec:
   spec:
     containers:
     - name: my-infer
+      # !!! put your own image location instead
       image: myregistry.azurecr.io/rollingstone/myinfer:1.0
       ports:
       # we use only 5001, but the container exposes EXPOSE 5001 8883 8888
@@ -37,5 +38,5 @@ spec:
       # memory: "128Mi" #128 MB
       # cpu: "200m" # 200 millicpu (0.2 or 20% of the cpu)
       nvidia.com/gpu: 1
-  imagePullSecrets:
-  - name: secret4acr2infer
+  #imagePullSecrets:
+  # - name: secret4acr2infer
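For orientation, the two hunks above slot into an ordinary Deployment manifest. Below is a minimal sketch of how the edited pieces fit together; the field values come from the diff, while the surrounding structure (labels, selector, containerPort) is assumed, since the commit does not show it:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-infer
spec:
  replicas: 1
  selector:
    matchLabels:
      app: my-infer        # label/selector names are assumed, not shown in the diff
  template:
    metadata:
      labels:
        app: my-infer
    spec:
      containers:
      - name: my-infer
        # !!! put your own image location instead
        image: myregistry.azurecr.io/rollingstone/myinfer:1.0
        ports:
        - containerPort: 5001   # the container also exposes 8883 and 8888
        resources:
          limits:
            nvidia.com/gpu: 1   # request one whole GPU from the device plugin
      # The commit comments this out; re-enable it if your registry needs credentials:
      #imagePullSecrets:
      #- name: secret4acr2infer
```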

edge_k8s_gpu_sharing/deploy_infer2.yaml

Lines changed: 3 additions & 2 deletions
@@ -24,6 +24,7 @@ spec:
   spec:
     containers:
     - name: my-infer2
+      # !!! put your own image location instead
       image: myregistry.azurecr.io/rollingstone/myinfer:1.0
       env:
       - name: NVIDIA_VISIBLE_DEVICES
@@ -38,5 +39,5 @@ spec:
       limits:
       # not using gpu allocation via `limits`, using NVIDIA_VISIBLE_DEVICES env.
       # nvidia.com/gpu: 1
-  imagePullSecrets:
-  - name: secret4acr2infer
+  #imagePullSecrets:
+  # - name: secret4acr2infer
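Unlike `deploy_infer.yaml`, this manifest allocates no GPU through `resources.limits`; instead it points the container at a device directly via the `NVIDIA_VISIBLE_DEVICES` environment variable, which is how multiple pods can share one physical GPU. The hunk truncates the variable's value, so the `"0"` below is a placeholder device index, not a value taken from the diff:

```yaml
spec:
  containers:
  - name: my-infer2
    # !!! put your own image location instead
    image: myregistry.azurecr.io/rollingstone/myinfer:1.0
    env:
    - name: NVIDIA_VISIBLE_DEVICES
      value: "0"   # placeholder: expose GPU 0 to this container (value not shown in the diff)
    # not using gpu allocation via `limits`, using NVIDIA_VISIBLE_DEVICES env.
```

Because the scheduler never sees a `nvidia.com/gpu` request for these pods, it will happily place several of them on the same node; the trade-off is that nothing prevents the sharing workloads from contending for GPU memory.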

edge_k8s_gpu_sharing/deploy_infer3.yaml

Lines changed: 3 additions & 2 deletions
@@ -24,6 +24,7 @@ spec:
   spec:
     containers:
     - name: my-infer3
+      # !!! put your own image location instead
       image: myregistry.azurecr.io/rollingstone/myinfer:1.0
       env:
       - name: NVIDIA_VISIBLE_DEVICES
@@ -38,5 +39,5 @@ spec:
       limits:
       # not using gpu allocation via `limits`, using NVIDIA_VISIBLE_DEVICES env.
       # nvidia.com/gpu: 1
-  imagePullSecrets:
-  - name: secret4acr2infer
+  #imagePullSecrets:
+  # - name: secret4acr2infer

edge_k8s_gpu_sharing/deploy_infer_GPU_GREED.yaml

Lines changed: 8 additions & 7 deletions
@@ -7,21 +7,22 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: my-infer-GPU_GREED
+  name: my-infer-gpugreed
   labels:
-    app: my-infer-GPU_GREED
+    app: my-infer-gpugreed
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: my-infer-GPU_GREED
+      app: my-infer-gpugreed
   template:
     metadata:
       labels:
-        app: my-infer-GPU_GREED
+        app: my-infer-gpugreed
     spec:
       containers:
-      - name: my-infer-GPU_GREED
+      - name: my-infer-gpugreed
+        # !!! put your own image location instead
         image: myregistry.azurecr.io/rollingstone/myinfer:1.0
         ports:
         # we use only 5001, but the container exposes EXPOSE 5001 8883 8888
@@ -37,5 +38,5 @@ spec:
         # memory: "128Mi" #128 MB
         # cpu: "200m" # 200 millicpu (0.2 or 20% of the cpu)
         nvidia.com/gpu: 100
-  imagePullSecrets:
-  - name: secret4acr2infer
+  #imagePullSecrets:
+  # - name: secret4acr2infer
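Besides lower-casing the name (Kubernetes object names must be valid DNS labels, so `my-infer-GPU_GREED` was rejected), this manifest exists to demonstrate a scheduling failure: it requests far more GPUs than any node offers, so the pod stays `Pending` with an `Insufficient nvidia.com/gpu` event. The key fragment, with values from the diff:

```yaml
resources:
  limits:
    # deliberate over-request: no node has 100 GPUs, so the scheduler
    # reports "0/2 nodes are available: 100 Insufficient nvidia.com/gpu"
    nvidia.com/gpu: 100
```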

edge_k8s_gpu_sharing/kubernetes_gpu_sharing_edge.md

Lines changed: 1 addition & 2 deletions
@@ -48,7 +48,7 @@ in a deployment .yaml these entries would request an allocation of one gpu devic
     nvidia.com/gpu: 1
 ...
 
-To damonstrate this, let's deploy one of our previous models from [machine-learning-notebooks/deploying-on-k8s](../machine-learning-notebooks/deploying-on-k8s),
+To damonstrate this, let's deploy one of our previous models from [machine-learning-notebooks/deploying-on-k8s](../machine-learning-notebooks/deploying-on-k8s/Readme.md),
 you will need to run this notebook to create the container image: [machine-learning-notebooks/deploying-on-k8s/production-deploy-to-k8s-gpu.ipynb](../machine-learning-notebooks/deploying-on-k8s/production-deploy-to-k8s-gpu.ipynb).
 
 `deploy_infer.yaml` will look like this:
@@ -366,5 +366,4 @@ To clean the environment from what we created, we need to delete the deployments
 # Links
 
 - https://docs.microsoft.com/en-us/azure/databox-online/azure-stack-edge-gpu-connect-powershell-interface#view-gpu-driver-information
-
 - https://nvidia.github.io/gpu-operator/

edge_k8s_gpu_sharing/kubernetes_gpu_sharing_one_node.md

Lines changed: 19 additions & 5 deletions
@@ -4,10 +4,19 @@ This demo shows how to deploy multiple gpu-requiring workloads on a cluster with
 
 ## Pre-requisites
 
-Please follow the instructions in [Deploying model to Kubernetes](../deploying-on-k8s/README.md)
+To create a one-node gpu-capable Kubernetes cluster, you need a gpu-capable VM. During creation of the
+VMs, you need to specify a GPU-capable VM Size(either at Portal, or in your deployment template).
+
+Please follow the instructions in [Deploying model to Kubernetes](../machine-learning-notebooks/deploying-on-k8s/Readme.md)
 to make sure you have a GPU-capable node on your vm.
 
-Please see [NVIDIA webpage](https://docs.nvidia.com/datacenter/kubernetes/kubernetes-upstream/index.html#kubernetes-run-a-workload) if you have any problems. You should be able to run nvidia-smi:
+If you need to install docker, follow the instructions at [Nvidia cloud native containers](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker).
+
+And if you need to install the drivers, see [Azure VM driver setup](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/n-series-driver-setup) or related. You might have to upgrade your system and/or drivers to work.
+
+Please see [NVIDIA webpage](https://docs.nvidia.com/datacenter/kubernetes/kubernetes-upstream/index.html#kubernetes-run-a-workload) if you have any problems.
+
+Before moving forward, you should be able to run nvidia-smi:
 
     $ sudo docker run --rm --runtime=nvidia nvidia/cuda nvidia-smi
     +-----------------------------------------------------------------------------+
@@ -27,7 +36,7 @@ Please see [NVIDIA webpage](https://docs.nvidia.com/datacenter/kubernetes/kubern
     | No running processes found |
     +-----------------------------------------------------------------------------+
 
-Once you installed `microk8s` as in our demo [Deploying model to Kubernetes](../machine-learning-notebooks/deploying-on-k8s/README.md),
+Once you installed `microk8s` as in our demo [Deploying model to Kubernetes](../machine-learning-notebooks/deploying-on-k8s/Readme.md),
 you should also be able to see `nvidia-smi` from within a pod:
 
     $ kubectl exec -it gpu-pod nvidia-smi
@@ -67,7 +76,7 @@ in a deployment .yaml these entries would request an allocation of one gpu devic
     nvidia.com/gpu: 1
 ...
 
-To damonstrate this, let's deploy one of our previous models from [machine-learning-notebooks/deploying-on-k8s](../machine-learning-notebooks/deploying-on-k8s),
+To damonstrate this, let's deploy one of our previous models from [machine-learning-notebooks/deploying-on-k8s](../machine-learning-notebooks/deploying-on-k8s/Readme.md),
 you will need to run this notebook to create the container image: [machine-learning-notebooks/deploying-on-k8s/production-deploy-to-k8s-gpu.ipynb](../machine-learning-notebooks/deploying-on-k8s/production-deploy-to-k8s-gpu.ipynb).
 
 `deploy_infer.yaml` will look like this:
@@ -153,6 +162,11 @@ indicate insufficient resource:
     Warning  FailedScheduling  <unknown>  default-scheduler  0/2 nodes are available: 100 Insufficient nvidia.com/gpu.
     ...
 
+    $ kubectl get pods -n myasetest1
+    NAMESPACE    NAME                                 READY   STATUS    RESTARTS   AGE
+    myasetest1   my-infer-f79869b88-vfbnx             1/1     Running   0          41m
+    myasetest1   my-infer-gpugreed-5c88f68f6b-c9gd5   0/1     Pending   0          9m
+
 You can delete it like so:
 
     $ kubectl delete -f deploy_infer_GPU_GREED.yaml -n myasetest1
@@ -376,4 +390,4 @@ To clean the environment from what we created, we need to delete the deployments
 - https://docs.microsoft.com/en-us/azure/databox-online/azure-stack-edge-gpu-connect-powershell-interface#view-gpu-driver-information
 - https://nvidia.github.io/gpu-operator/
 - https://github.com/NVIDIA/k8s-device-plugin/blob/examples/workloads/pod.yml
-- [Deploying model to Kubernetes](../machine-learning-notebooks/deploying-on-k8s/README.md)
+- [Deploying model to Kubernetes](../machine-learning-notebooks/deploying-on-k8s/Readme.md)
