-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathvalues.yaml
More file actions
189 lines (183 loc) · 7.49 KB
/
values.yaml
File metadata and controls
189 lines (183 loc) · 7.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
controllerManager:
  ## tolerations: add tolerations to the controller manager pod
  ## Example below is for a system-cpu tainted node
  # tolerations:
  #   - key: dedicated
  #     operator: "Equal"
  #     value: system-cpu
  #     effect: NoSchedule
  tolerations: []
  ## selectors: add simple node selectors to the controller manager pod
  ## Example below is for a system-workload node selector
  ## NOTE: Cannot be used together with nodeAffinity.matchExpressions
  # selectors:
  #   dedicated: system-workload
  selectors: {}
  ## nodeAffinity: add advanced node affinity expressions to the controller manager pod
  ## This allows for more complex node selection than simple selectors
  ## NOTE: Cannot be used together with selectors - choose one approach
  ## Example below shows how to select nodes with specific labels using expressions
  # nodeAffinity:
  #   matchExpressions:
  #     - key: node-role.kubernetes.io/control-plane
  #       operator: DoesNotExist
  #     - key: dedicated
  #       operator: In
  #       values:
  #         - system-workload
  #         - gpu-workload
  nodeAffinity:
    matchExpressions: []
  ## config for kube-rbac-proxy used for webhooks
  kubeRbacProxy:
    args:
      - --secure-listen-address=0.0.0.0:8443
      - --upstream=http://127.0.0.1:8080/
      - --logtostderr=true
      - --v=0
    containerSecurityContext:
      allowPrivilegeEscalation: false
      readOnlyRootFilesystem: true
      capabilities:
        drop:
          - ALL
    image:
      repository: quay.io/brancz/kube-rbac-proxy
      tag: v0.15.0
      digest: "sha256:2c7b120590cbe9f634f5099f2cbb91d0b668569023a81505ca124a5c437e7663" # manifest list digest (multi-arch)
    resources:
      limits:
        cpu: 500m
        memory: 128Mi
      requests:
        cpu: 5m
        memory: 64Mi
  manager:
    containerSecurityContext:
      allowPrivilegeEscalation: false
      readOnlyRootFilesystem: true
      capabilities:
        drop:
          - ALL
    env:
      ## copyDirRoot is the directory for which the operator will work from on the host.
      ## Some environments may require this to be set to a specific directory.
      copyDirRoot: /var/lib/skyhook
      ## agentLogRoot is the directory for which the agent will write logs.
      ## Some environments may require this to be set to a specific directory.
      agentLogRoot: /var/log/skyhook
      ## leaderElection: "true" will enable leader election for the operator controller
      ## Default is "true" and is required for production.
      ## NOTE: kept as a quoted string because env var values must be strings.
      leaderElection: "true"
      ## logLevel: "info" is the log level for the operator controller.
      ## If you want more or less logs, change this value to "debug" or "error".
      logLevel: info
      ## ports are quoted: a plain YAML scalar starting with ":" is fragile across parsers.
      metricsPort: ":8080"
      probePort: ":8081"
      ## reapplyOnReboot: "true" will reapply the packages on reboot. This is useful for systems that are read-only.
      reapplyOnReboot: "false"
      ## runtimeRequiredTaint: This feature assumes nodes are added to the cluster with `--register-with-taints` kubelet flag.
      ## This taint is assumed to be on all new nodes; skyhook pods will tolerate this taint and remove it once the node's packages are complete.
      ## NOTE: If your system's nodes have this taint, make sure to add the toleration to controllerManager.tolerations
      runtimeRequiredTaint: "skyhook.nvidia.com=runtime-required:NoSchedule"
      ## pauseImage: is the image used for the pause container in the operator controller.
      pauseImage: registry.k8s.io/pause:3.10
    image:
      repository: nvcr.io/nvidia/skyhook/operator
      tag: "v0.12.0" ## if both tag and digest are omitted, defaults to the chart appVersion
      digest: "sha256:ce79a9778fca453e54d58506c71c8ff6765b65d44a73fb167441ab851c108dc2" # manifest list digest (multi-arch)
    ## agent: is the image used for the agent container. This image is the default for this install, but can be overridden in the CR at package level.
    agent:
      repository: nvcr.io/nvidia/skyhook/agent
      tag: "v6.4.0"
      digest: "sha256:2bc0fe8c5c11130c843859dd0c8325e316bf4a9bb1d5883554c90a7a0574a771" # manifest list digest (multi-arch) - update after CI publishes images
    # resources: If this is defined it will override the default calculation for resources
    # from estimatedNodeCount and estimatedPackageCount. The below values are
    # what will be calculated until nodes > 1000 and packages 1-2 or nodes > 500 and packages >= 4
    # resources:
    #   limits:
    #     cpu: 1000m
    #     memory: 512Mi
    #   requests:
    #     cpu: 200m
    #     memory: 64Mi
  ## replicas: number of the operator controller instances to run. This should be set to 2 or more for production.
  replicas: 2 ## more than 1 replica requires leaderElection to be enabled
  ## podDisruptionBudget: pod disruption budget for the operator controller, this should be left as is for production deployments.
  podDisruptionBudget:
    minAvailable: 1 ## keep in sync with replicas, but strictly less than it
  serviceAccount:
    annotations: {}
kubernetesClusterDomain: cluster.local
metricsService:
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8080"
    prometheus.io/scheme: "http"
  ports:
    - name: metrics
      port: 8443
      targetPort: 8443
      protocol: TCP
    - name: metrics-http
      port: 8080
      targetPort: 8080
      protocol: TCP
  type: ClusterIP
webhookService:
  ports:
    - name: webhook
      port: 443
      protocol: TCP
      targetPort: 9443
  type: ClusterIP
rbac:
  createSkyhookViewerRole: false
  createSkyhookEditorRole: false
## imagePullSecret: is the secret used to pull the operator controller image, agent image, and package images.
imagePullSecret: ""
## useHostNetwork: Whether the Operator pods should use hostNetwork: true or false
useHostNetwork: false
## estimatedPackageCount: estimated number of packages to be installed on the cluster
## this is used to calculate the resources for the operator controller
estimatedPackageCount: 1
## estimatedNodeCount: estimated number of nodes in the cluster
## this is used to calculate the resources for the operator controller
estimatedNodeCount: 1
## webhook config
webhook:
  ## secretName: name of the secret to store the webhook certificate
  secretName: webhook-cert
  ## serviceName: name of the service to expose the webhook
  serviceName: skyhook-operator-webhook-service
  ## enable: "true" will enable the webhook setup in the operator controller.
  ## Default is "true" and is required for production.
  enable: true
  ## uninstall image for cleaning up webhook resources
  removalImage: bitnami/kubectl
  removalTag: latest
  removalDigest: "sha256:1bc359beb3ae3982591349df11db50b0917b0596e8bed8ab9cf0c8a84a3502d1"
  ## cleanup: Configuration for pre-delete cleanup jobs
  ## NOTE(review): nesting under webhook inferred from placement next to the removal
  ## image it uses - confirm against the chart templates.
  cleanup:
    ## enabled: When true, automatically deletes all Skyhook and DeploymentPolicy
    ## resources before helm uninstall. Recommended to prevent orphaned CRs.
    enabled: true
    ## jobTimeoutSeconds: Hard deadline for the entire cleanup job.
    ## The job will be killed if it exceeds this time.
    jobTimeoutSeconds: 120
metrics:
  addServiceAccountBinding: false
  serviceAccountName: prometheus
  serviceAccountNamespace: monitoring
## limitRange: is the limit range for the operator controller.
## This sets for all containers in the namespace.
## So if your package does not override the limits, these are what will be used.
## If you omit this, we will not create a limit range.
## Best practice on limits and requests is to make the limits at most 2x the requests.
limitRange:
  default:
    cpu: "500m"
    memory: "512Mi"
  defaultRequest:
    cpu: "250m"
    memory: "256Mi"