Skip to content

Commit 88ee5ea

Browse files
authored
Merge branch 'main' into iam-workers-crossplane
2 parents dcaf0db + 760cc03 commit 88ee5ea

8 files changed

+165
-53
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99

1010
### Added
1111

12+
- Add node-problem-detector-app, disabled by default.
1213
- Add Crossplane IAM Roles, policies and instance profiles for the worker nodes. Instead of creating a separate IAM Role per node pool, a single IAM Role is now shared by all node pools. *This change will roll the worker nodes*.
1314

1415
### Changed

helm/cluster-aws/Chart.lock

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
dependencies:
22
- name: cluster
33
repository: https://giantswarm.github.io/cluster-catalog
4-
version: 4.3.0
4+
version: 4.4.0
55
- name: cluster-shared
66
repository: https://giantswarm.github.io/cluster-catalog
77
version: 0.7.1
8-
digest: sha256:98128b1252d1b43d0d6778546eded7d0be50da943a7d4e108cf95f581c0d5067
9-
generated: "2025-10-24T22:03:46.995588877Z"
8+
digest: sha256:5a5cf5f0c737e152a9196a47a8a59b573f81f7bb6b56326db2728ad917672f0d
9+
generated: "2025-10-28T10:02:18.817133+01:00"

helm/cluster-aws/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ restrictions:
1616
- capa
1717
dependencies:
1818
- name: cluster
19-
version: "4.3.0"
19+
version: "4.4.0"
2020
repository: https://giantswarm.github.io/cluster-catalog
2121
- name: cluster-shared
2222
version: "0.7.1"

helm/cluster-aws/README.md

Lines changed: 12 additions & 2 deletions
Large diffs are not rendered by default.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
global:
2+
release:
3+
version: 29.1.0
4+
metadata:
5+
name: node-problem-detector-used
6+
organization: test
7+
servicePriority: lowest
8+
connectivity:
9+
baseDomain: example.com
10+
controlPlane:
11+
machineHealthCheck:
12+
enabled: true
13+
diskFullVarLogTimeout: "1337m"
14+
providerSpecific:
15+
region: "eu-west-1"
16+
managementCluster: test
17+
18+
cluster:
19+
internal:
20+
ephemeralConfiguration:
21+
offlineTesting:
22+
renderWithoutReleaseResource: true
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
global:
2+
release:
3+
version: 29.1.0
4+
metadata:
5+
name: node-problem-detector-used
6+
organization: test
7+
servicePriority: lowest
8+
connectivity:
9+
baseDomain: example.com
10+
nodePools:
11+
pool0:
12+
maxSize: 3
13+
minSize: 3
14+
machineHealthCheck:
15+
enabled: true
16+
diskFullContainerdTimeout: "1337m"
17+
providerSpecific:
18+
region: "eu-west-1"
19+
managementCluster: test
20+
21+
cluster:
22+
internal:
23+
ephemeralConfiguration:
24+
offlineTesting:
25+
renderWithoutReleaseResource: true

helm/cluster-aws/values.schema.json

Lines changed: 95 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,86 @@
168168
}
169169
}
170170
},
171+
"machineHealthCheck": {
172+
"type": "object",
173+
"title": "Machine health check",
174+
"required": [
175+
"enabled"
176+
],
177+
"properties": {
178+
"diskFullContainerdTimeout": {
179+
"type": "string",
180+
"title": "DiskFullContainerd timeout",
181+
"description": "Determines how long a machine health check should wait for a node with condition DiskFullContainerd=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
182+
"examples": [
183+
"10m",
184+
"100s"
185+
],
186+
"default": ""
187+
},
188+
"diskFullKubeletTimeout": {
189+
"type": "string",
190+
"title": "DiskFullKubelet timeout",
191+
"description": "Determines how long a machine health check should wait for a node with condition DiskFullKubelet=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
192+
"examples": [
193+
"10m",
194+
"100s"
195+
],
196+
"default": ""
197+
},
198+
"diskFullVarLogTimeout": {
199+
"type": "string",
200+
"title": "DiskFullVarLog timeout",
201+
"description": "Determines how long a machine health check should wait for a node with condition DiskFullVarLog=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
202+
"examples": [
203+
"10m",
204+
"100s"
205+
],
206+
"default": ""
207+
},
208+
"enabled": {
209+
"type": "boolean",
210+
"title": "Enable",
211+
"default": true
212+
},
213+
"maxUnhealthy": {
214+
"type": "string",
215+
"title": "Maximum unhealthy nodes",
216+
"description": "Defaults to 40% for control plane nodes and 20% for worker nodes.",
217+
"examples": [
218+
"40%"
219+
]
220+
},
221+
"nodeStartupTimeout": {
222+
"type": "string",
223+
"title": "Node startup timeout",
224+
"description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.",
225+
"examples": [
226+
"10m",
227+
"100s"
228+
],
229+
"default": "8m0s"
230+
},
231+
"unhealthyNotReadyTimeout": {
232+
"type": "string",
233+
"title": "Timeout for ready",
234+
"description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.",
235+
"examples": [
236+
"300s"
237+
],
238+
"default": "10m0s"
239+
},
240+
"unhealthyUnknownTimeout": {
241+
"type": "string",
242+
"title": "Timeout for unknown condition",
243+
"description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.",
244+
"examples": [
245+
"300s"
246+
],
247+
"default": "10m0s"
248+
}
249+
}
250+
},
171251
"machinePool": {
172252
"type": "object",
173253
"title": "Node pool",
@@ -664,6 +744,10 @@
664744
"description": "Size of the volume mounted at `/var/log` on the worker nodes.",
665745
"default": 30
666746
},
747+
"machineHealthCheck": {
748+
"$ref": "#/$defs/machineHealthCheck",
749+
"title": "Machine health check"
750+
},
667751
"maxHealthyPercentage": {
668752
"type": "integer",
669753
"title": "Maximum percentage of instances that can be in service when replacing instances.",
@@ -834,6 +918,9 @@
834918
"nodeExporter": {
835919
"enable": true
836920
},
921+
"nodeProblemDetector": {
922+
"enable": true
923+
},
837924
"observabilityBundle": {
838925
"enable": true
839926
},
@@ -1365,6 +1452,12 @@
13651452
"title": "node-exporter",
13661453
"description": "Configuration of node-exporter. For all available values see https://github.com/giantswarm/node-exporter-app."
13671454
},
1455+
"nodeProblemDetector": {
1456+
"$ref": "#/$defs/helmRelease",
1457+
"type": "object",
1458+
"title": "node-problem-detector",
1459+
"description": "Configuration of node-problem-detector-app. For all available values see https://github.com/giantswarm/node-problem-detector-app."
1460+
},
13681461
"observabilityBundle": {
13691462
"$ref": "#/$defs/app",
13701463
"type": "object",
@@ -2164,52 +2257,8 @@
21642257
"default": 15
21652258
},
21662259
"machineHealthCheck": {
2167-
"type": "object",
2168-
"title": "Machine health check",
2169-
"additionalProperties": false,
2170-
"properties": {
2171-
"enabled": {
2172-
"type": "boolean",
2173-
"title": "Enable",
2174-
"default": true
2175-
},
2176-
"maxUnhealthy": {
2177-
"type": "string",
2178-
"title": "Maximum unhealthy nodes",
2179-
"examples": [
2180-
"40%"
2181-
],
2182-
"default": "40%"
2183-
},
2184-
"nodeStartupTimeout": {
2185-
"type": "string",
2186-
"title": "Node startup timeout",
2187-
"description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.",
2188-
"examples": [
2189-
"10m",
2190-
"100s"
2191-
],
2192-
"default": "8m0s"
2193-
},
2194-
"unhealthyNotReadyTimeout": {
2195-
"type": "string",
2196-
"title": "Timeout for ready",
2197-
"description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.",
2198-
"examples": [
2199-
"300s"
2200-
],
2201-
"default": "10m0s"
2202-
},
2203-
"unhealthyUnknownTimeout": {
2204-
"type": "string",
2205-
"title": "Timeout for unknown condition",
2206-
"description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.",
2207-
"examples": [
2208-
"300s"
2209-
],
2210-
"default": "10m0s"
2211-
}
2212-
}
2260+
"$ref": "#/$defs/machineHealthCheck",
2261+
"title": "Machine health check"
22132262
},
22142263
"oidc": {
22152264
"type": "object",

helm/cluster-aws/values.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ cluster:
4343
enable: true
4444
nodeExporter:
4545
enable: true
46+
nodeProblemDetector:
47+
enable: true
4648
observabilityBundle:
4749
enable: true
4850
observabilityPolicies:
@@ -296,6 +298,7 @@ global:
296298
netExporter: {}
297299
networkPolicies: {}
298300
nodeExporter: {}
301+
nodeProblemDetector: {}
299302
observabilityBundle: {}
300303
observabilityPolicies: {}
301304
prometheusBlackboxExporter: {}
@@ -382,8 +385,10 @@ global:
382385
libVolumeSizeGB: 40
383386
logVolumeSizeGB: 15
384387
machineHealthCheck:
388+
diskFullContainerdTimeout: ""
389+
diskFullKubeletTimeout: ""
390+
diskFullVarLogTimeout: ""
385391
enabled: true
386-
maxUnhealthy: 40%
387392
nodeStartupTimeout: 8m0s
388393
unhealthyNotReadyTimeout: 10m0s
389394
unhealthyUnknownTimeout: 10m0s

0 commit comments

Comments
 (0)