Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,25 +38,31 @@ will give you 1.5 metagpus.
### [MetaGPU demo from Cnvrg's MLCon 2.0](https://www.youtube.com/watch?v=hsP9GXUtNNs)

### Deployment
1. clone the repo
2. use helm chart to install or dump manifest and install manually
1. Clone the repo
2. Use Helm chart to install (or alternatively use static manifest)

### Install with helm chart

The Helm chart is located in the `chart` directory.

Go through `values.yaml` and adjust it to your setup, following the comments in the file.

Install the chart:
```bash
# cd into cloned directory and run
# for openshift set ocp=true
helm install chart --set ocp=false -ncnvrg
cp chart/values.yaml myvalues.yaml
# edit myvalues.yaml
helm install metagpu ./chart --values myvalues.yaml -n cnvrg
```

### Install with raw K8s manifests

An example of a static all-in-one manifest file is located at `deploy/static.yaml`.

You can adjust it to your needs and install with:
```bash
# cd into cloned directory and run
# for openshift set ocp=true
helm template chart --set ocp=false -ncnvrg > metagpu.yaml
kubectl apply -f metagpu.yaml
kubectl apply -f deploy/static.yaml
```


### Test the MetaGPU
```bash
cat <<EOF | kubectl apply -f -
Expand All @@ -72,7 +78,7 @@ spec:
- name: gpu-test-with-gpu
image: tensorflow/tensorflow:latest-gpu
command:
- /usr/local/bin/python
- /usr/bin/python
- -c
- |
import tensorflow as tf
Expand Down
23 changes: 23 additions & 0 deletions chart/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
# macOS Finder metadata
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
4 changes: 2 additions & 2 deletions chart/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ apiVersion: v2
name: metagpu-device-plugin
description: Metagpu device plugin
type: application
version: 1.0.0
appVersion: 1.0.0
version: 2.0.0
appVersion: v0.0.9
11 changes: 11 additions & 0 deletions chart/templates/NOTES.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@


MetaGPU Device Plugin has been deployed.

You can verify that the DaemonSet is up and running with:
kubectl get ds {{ include "metagpu.fullname" . }} --namespace {{ .Release.Namespace }}

You can now request MetaGPUs by specifying resource requests for the following resource names:
{{- range .Values.config.deviceSharing }}
- {{ .resourceName }}
{{- end }}
59 changes: 59 additions & 0 deletions chart/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Short name of the chart: `.Values.nameOverride` when it is set, otherwise
`.Chart.Name`. The result is cut to 63 characters and a trailing "-" is removed.
*/}}
{{- define "metagpu.name" -}}
{{- .Values.nameOverride | default .Chart.Name | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Fully qualified app name, limited to 63 characters because some Kubernetes
name fields are limited to this (by the DNS naming spec).

Resolution order:
  1. `.Values.fullnameOverride`, when set;
  2. the release name alone, when it already contains the chart name
     (or `.Values.nameOverride`);
  3. "<release name>-<chart name>" otherwise.
A trailing "-" left over after truncation is removed.
*/}}
{{- define "metagpu.fullname" -}}
{{- $release := .Release.Name -}}
{{- $base := .Values.nameOverride | default .Chart.Name -}}
{{- $full := printf "%s-%s" $release $base -}}
{{- if .Values.fullnameOverride -}}
{{- $full = .Values.fullnameOverride -}}
{{- else if contains $base $release -}}
{{- $full = $release -}}
{{- end -}}
{{- $full | trunc 63 | trimSuffix "-" -}}
{{- end }}

{{/*
Value for the chart label: "<chart name>-<chart version>" with every "+"
replaced by "_", cut to 63 characters and stripped of a trailing "-".
*/}}
{{- define "metagpu.chart" -}}
{{- $nameAndVersion := printf "%s-%s" .Chart.Name .Chart.Version -}}
{{- $nameAndVersion | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels: the chart label, the selector labels, the app version
(only when the chart declares one) and the managing service.
*/}}
{{- define "metagpu.labels" -}}
helm.sh/chart: {{ include "metagpu.chart" . }}
{{ include "metagpu.selectorLabels" . }}
{{- with .Chart.AppVersion }}
app.kubernetes.io/version: {{ quote . }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels: the name/instance pair built from "metagpu.name" and the
release name. This template is also embedded in "metagpu.labels".
*/}}
{{- define "metagpu.selectorLabels" -}}
app.kubernetes.io/name: {{ include "metagpu.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Name of the service account to use: `.Values.serviceAccount.name` when it is
set, otherwise the value of "metagpu.fullname".
*/}}
{{- define "metagpu.serviceAccountName" -}}
{{- .Values.serviceAccount.name | default (include "metagpu.fullname" .) }}
{{- end }}
25 changes: 25 additions & 0 deletions chart/templates/cm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{{- /*
ConfigMap "<fullname>-config" carrying the device plugin's config.yaml.

String values (paths, JWT secret, tokens) are rendered with `quote` so that
the embedded YAML always yields strings, even when a value looks like a
number or boolean or contains YAML indicator characters. Booleans and
integers are intentionally left unquoted.

NOTE(review): jwtSecret, deviceToken and containerToken are stored in a
ConfigMap, not a Secret; anyone who can read ConfigMaps in the namespace can
read them.
NOTE(review): the key `mgctlMountContainertPath` is kept exactly as it was;
confirm the spelling against the plugin's config loader before renaming it.
*/ -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "metagpu.fullname" . }}-config
  labels:
    {{- include "metagpu.labels" . | nindent 4 }}
data:
  config.yaml: |
    accelerator: nvidia
    processesDiscoveryPeriod: {{ .Values.config.deviceManager.processesDiscoveryPeriod | int }}
    deviceCacheTTL: {{ .Values.config.deviceManager.deviceCacheTTL | int }}
    mgctlTar: {{ .Values.config.mgctl.sourcePath | quote }}
    mgctlAutoInject: {{ .Values.config.mgctl.podExecCopy.enabled }}
    mgctlMount: {{ .Values.config.mgctl.hostMount.enabled }}
    mgctlMountHostPath: {{ .Values.config.mgctl.hostMount.hostPath | quote }}
    mgctlMountContainertPath: {{ .Values.config.mgctl.hostMount.containerPath | quote }}
    serverAddr: 0.0.0.0:50052
    memoryEnforcer: {{ .Values.config.memoryEnforcer }}
    jwtSecret: {{ .Values.config.grpcSecurity.jwtSecret | quote }}
    deviceToken: {{ .Values.config.grpcSecurity.deviceToken | quote }}
    containerToken: {{ .Values.config.grpcSecurity.containerToken | quote }}
    {{- with .Values.config.deviceSharing }}
    deviceSharing:
      {{- toYaml . | nindent 6 }}
    {{- end }}
30 changes: 0 additions & 30 deletions chart/templates/cm.yml

This file was deleted.

109 changes: 109 additions & 0 deletions chart/templates/ds.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
{{- /*
DaemonSet running the MetaGPU device plugin and, when
`.Values.exporter.enabled` is set, a metrics exporter sidecar built from the
same image.

The plugin container is started through `/bin/sh -c` so that the mgctl binary
can first be copied to the host-mounted directory when
`.Values.config.mgctl.hostMount.enabled` is set. The plugin itself is started
with `exec`, so it replaces the shell process and receives the container's
termination signal directly instead of being left running as a child of the
shell.

NOTE(review): the device token is passed in clear text both as the
MG_CTL_TOKEN environment variable and as an exporter command-line argument.
*/ -}}
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ include "metagpu.fullname" . }}
  labels:
    {{- include "metagpu.labels" . | nindent 4 }}
spec:
  selector:
    matchLabels:
      {{- include "metagpu.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      {{- with .Values.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        {{- include "metagpu.selectorLabels" . | nindent 8 }}
    spec:
      hostPID: true
      hostNetwork: true
      serviceAccountName: {{ include "metagpu.serviceAccountName" . }}
      containers:
        - name: metagpu-device-plugin
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          command:
            - /bin/sh
            - -c
            - |
              {{- if .Values.config.mgctl.hostMount.enabled }}
              cp -f {{ .Values.config.mgctl.sourcePath }} /var/lib/metagpu/mgctl
              {{- end }}
              exec /usr/bin/mgdp start -c /etc/metagpu-device-plugin
          ports:
            - containerPort: 50052
              name: grpc
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          env:
            - name: METAGPU_DEVICE_PLUGIN_NODENAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: MG_CTL_TOKEN
              value: {{ .Values.config.grpcSecurity.deviceToken | quote }}
            {{- with .Values.extraEnv }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: config
              mountPath: /etc/metagpu-device-plugin
            - name: proc
              mountPath: /host/proc
              mountPropagation: HostToContainer
              readOnly: true
            {{- if .Values.config.mgctl.hostMount.enabled }}
            - name: mgctl
              mountPath: /var/lib/metagpu
            {{- end }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
        {{- if .Values.exporter.enabled }}
        - name: metagpu-exporter
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          command:
            - /usr/bin/mgex
            - start
            - -t
            - {{ .Values.config.grpcSecurity.deviceToken | quote }}
          ports:
            - name: metrics
              containerPort: 2112
          resources:
            {{- toYaml .Values.exporter.resources | nindent 12 }}
        {{- end }}
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: config
          configMap:
            name: {{ include "metagpu.fullname" . }}-config
        - name: proc
          hostPath:
            path: /proc
        {{- if .Values.config.mgctl.hostMount.enabled }}
        - name: mgctl
          hostPath:
            path: {{ .Values.config.mgctl.hostMount.hostPath }}
            type: DirectoryOrCreate
        {{- end }}
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}


Loading