-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathvalues.yaml
More file actions
189 lines (183 loc) · 7.49 KB
/
values.yaml
File metadata and controls
189 lines (183 loc) · 7.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
controllerManager:
  ## tolerations: add tolerations to the controller manager pod
  ## Example below is for a system-cpu tainted node
  # tolerations:
  #   - key: dedicated
  #     operator: "Equal"
  #     value: system-cpu
  #     effect: NoSchedule
  tolerations: []
  ## selectors: add simple node selectors to the controller manager pod
  ## Example below is for a system-workload node selector
  ## NOTE: Cannot be used together with nodeAffinity.matchExpressions
  # selectors:
  #   dedicated: system-workload
  selectors: {}
  ## nodeAffinity: add advanced node affinity expressions to the controller manager pod
  ## This allows for more complex node selection than simple selectors
  ## NOTE: Cannot be used together with selectors - choose one approach
  ## Example below shows how to select nodes with specific labels using expressions
  # nodeAffinity:
  #   matchExpressions:
  #     - key: node-role.kubernetes.io/control-plane
  #       operator: DoesNotExist
  #     - key: dedicated
  #       operator: In
  #       values:
  #         - system-workload
  #         - gpu-workload
  nodeAffinity:
    matchExpressions: []
  ## config for kube-rbac-proxy used for webhooks
  kubeRbacProxy:
    args:
      - --secure-listen-address=0.0.0.0:8443
      - --upstream=http://127.0.0.1:8080/
      - --logtostderr=true
      - --v=0
    containerSecurityContext:
      allowPrivilegeEscalation: false
      readOnlyRootFilesystem: true
      capabilities:
        drop:
          - ALL
    image:
      repository: quay.io/brancz/kube-rbac-proxy
      tag: v0.15.0
      digest: "sha256:2c7b120590cbe9f634f5099f2cbb91d0b668569023a81505ca124a5c437e7663" # manifest list digest (multi-arch)
    resources:
      limits:
        cpu: 500m
        memory: 128Mi
      requests:
        cpu: 5m
        memory: 64Mi
  manager:
    containerSecurityContext:
      allowPrivilegeEscalation: false
      readOnlyRootFilesystem: true
      capabilities:
        drop:
          - ALL
    env:
      ## copyDirRoot is the directory for which the operator will work from on the host.
      ## Some environments may require this to be set to a specific directory.
      copyDirRoot: /var/lib/skyhook
      ## agentLogRoot is the directory for which the agent will write logs.
      ## Some environments may require this to be set to a specific directory.
      agentLogRoot: /var/log/skyhook
      ## leaderElection: "true" will enable leader election for the operator controller
      ## Default is "true" and is required for production.
      ## NOTE: kept as a quoted string because env var values must be strings.
      leaderElection: "true"
      ## logLevel: "info" is the log level for the operator controller.
      ## If you want more or less logs, change this value to "debug" or "error".
      logLevel: info
      ## ports are quoted: a plain YAML scalar starting with ":" is fragile across parsers.
      metricsPort: ":8080"
      probePort: ":8081"
      ## reapplyOnReboot: "true" will reapply the packages on reboot. This is useful for systems that are read-only.
      reapplyOnReboot: "false"
      ## runtimeRequiredTaint: This feature assumes nodes are added to the cluster with `--register-with-taints` kubelet flag.
      ## This taint is assumed to be on all new nodes; skyhook pods will tolerate this taint and remove it once the node's packages are complete.
      ## NOTE: If your system's nodes have this taint, make sure to add the toleration to controllerManager.tolerations
      runtimeRequiredTaint: "skyhook.nvidia.com=runtime-required:NoSchedule"
      ## pauseImage: is the image used for the pause container in the operator controller.
      pauseImage: registry.k8s.io/pause:3.10
    image:
      repository: nvcr.io/nvidia/skyhook/operator
      tag: "v0.12.0" ## if both tag and digest are omitted, defaults to the chart appVersion
      digest: "sha256:ce79a9778fca453e54d58506c71c8ff6765b65d44a73fb167441ab851c108dc2" # manifest list digest (multi-arch)
    ## agent: is the image used for the agent container. This image is the default for this install, but can be overridden in the CR at package level.
    agent:
      repository: nvcr.io/nvidia/skyhook/agent
      tag: "v6.4.0"
      digest: "sha256:2bc0fe8c5c11130c843859dd0c8325e316bf4a9bb1d5883554c90a7a0574a771" # manifest list digest (multi-arch) - update after CI publishes images
    # resources: If this is defined it will override the default calculation for resources
    # from estimatedNodeCount and estimatedPackageCount. The below values are
    # what will be calculated until nodes > 1000 and packages 1-2 or nodes > 500 and packages >= 4
    # resources:
    #   limits:
    #     cpu: 1000m
    #     memory: 512Mi
    #   requests:
    #     cpu: 200m
    #     memory: 64Mi
  ## replicas: number of the operator controller instances to run. This should be set to 2 or more for production.
  replicas: 2 ## more than 1 replica requires leaderElection to be enabled
  ## podDisruptionBudget: pod disruption budget for the operator controller, this should be left as is for production deployments.
  podDisruptionBudget:
    minAvailable: 1 ## keep in sync with replicas, but strictly less than it
  serviceAccount:
    annotations: {}
kubernetesClusterDomain: cluster.local
metricsService:
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8080"
    prometheus.io/scheme: "http"
  ports:
    - name: metrics
      port: 8443
      targetPort: 8443
      protocol: TCP
    - name: metrics-http
      port: 8080
      targetPort: 8080
      protocol: TCP
  type: ClusterIP
webhookService:
  ports:
    - name: webhook
      port: 443
      protocol: TCP
      targetPort: 9443
  type: ClusterIP
rbac:
  createSkyhookViewerRole: false
  createSkyhookEditorRole: false
## imagePullSecret: is the secret used to pull the operator controller image, agent image, and package images.
imagePullSecret: ""
## useHostNetwork: Whether the Operator pods should use hostNetwork: true or false
useHostNetwork: false
## estimatedPackageCount: estimated number of packages to be installed on the cluster
## this is used to calculate the resources for the operator controller
estimatedPackageCount: 1
## estimatedNodeCount: estimated number of nodes in the cluster
## this is used to calculate the resources for the operator controller
estimatedNodeCount: 1
## webhook config
webhook:
  ## secretName: name of the secret to store the webhook certificate
  secretName: webhook-cert
  ## serviceName: name of the service to expose the webhook
  serviceName: skyhook-operator-webhook-service
  ## enable: "true" will enable the webhook setup in the operator controller.
  ## Default is "true" and is required for production.
  enable: true
  ## uninstall image for cleaning up webhook resources
  removalImage: bitnami/kubectl
  removalTag: latest
  removalDigest: "sha256:1bc359beb3ae3982591349df11db50b0917b0596e8bed8ab9cf0c8a84a3502d1"
  ## cleanup: Configuration for pre-delete cleanup jobs
  ## NOTE(review): nesting under webhook inferred from placement next to the removal
  ## image it uses - confirm against the chart templates.
  cleanup:
    ## enabled: When true, automatically deletes all Skyhook and DeploymentPolicy
    ## resources before helm uninstall. Recommended to prevent orphaned CRs.
    enabled: true
    ## jobTimeoutSeconds: Hard deadline for the entire cleanup job.
    ## The job will be killed if it exceeds this time.
    jobTimeoutSeconds: 120
metrics:
  addServiceAccountBinding: false
  serviceAccountName: prometheus
  serviceAccountNamespace: monitoring
## limitRange: is the limit range for the operator controller.
## This sets for all containers in the namespace.
## So if your package does not override the limits, these are what will be used.
## If you omit this, we will not create a limit range.
## Best practice on limits and requests is to make the limits at most 2x the requests.
limitRange:
  default:
    cpu: "500m"
    memory: "512Mi"
  defaultRequest:
    cpu: "250m"
    memory: "256Mi"