Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Add node-problem-detector-app, disabled by default.

### Changed

- Tidy up dependencies on `azs-getter`.
Expand Down
6 changes: 3 additions & 3 deletions helm/cluster-aws/Chart.lock
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
dependencies:
- name: cluster
repository: https://giantswarm.github.io/cluster-catalog
version: 4.3.0
version: 4.4.0
- name: cluster-shared
repository: https://giantswarm.github.io/cluster-catalog
version: 0.7.1
digest: sha256:98128b1252d1b43d0d6778546eded7d0be50da943a7d4e108cf95f581c0d5067
generated: "2025-10-24T22:03:46.995588877Z"
digest: sha256:5a5cf5f0c737e152a9196a47a8a59b573f81f7bb6b56326db2728ad917672f0d
generated: "2025-10-28T10:02:18.817133+01:00"
2 changes: 1 addition & 1 deletion helm/cluster-aws/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ restrictions:
- capa
dependencies:
- name: cluster
version: "4.3.0"
version: "4.4.0"
repository: https://giantswarm.github.io/cluster-catalog
- name: cluster-shared
version: "0.7.1"
Expand Down
14 changes: 12 additions & 2 deletions helm/cluster-aws/README.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
global:
release:
version: 29.1.0
metadata:
name: node-problem-detector-used
organization: test
servicePriority: lowest
connectivity:
baseDomain: example.com
controlPlane:
machineHealthCheck:
enabled: true
diskFullVarLogTimeout: "1337m"
providerSpecific:
region: "eu-west-1"
managementCluster: test

cluster:
internal:
ephemeralConfiguration:
offlineTesting:
renderWithoutReleaseResource: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
global:
release:
version: 29.1.0
metadata:
name: node-problem-detector-used
organization: test
servicePriority: lowest
connectivity:
baseDomain: example.com
nodePools:
pool0:
maxSize: 3
minSize: 3
machineHealthCheck:
enabled: true
diskFullContainerdTimeout: "1337m"
providerSpecific:
region: "eu-west-1"
managementCluster: test

cluster:
internal:
ephemeralConfiguration:
offlineTesting:
renderWithoutReleaseResource: true
141 changes: 95 additions & 46 deletions helm/cluster-aws/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,86 @@
}
}
},
"machineHealthCheck": {
"type": "object",
"title": "Machine health check",
"required": [
"enabled"
],
"properties": {
"diskFullContainerdTimeout": {
"type": "string",
"title": "DiskFullContainerd timeout",
"description": "Determines how long a machine health check should wait for a node with condition DiskFullContainerd=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
"examples": [
"10m",
"100s"
],
"default": ""
},
"diskFullKubeletTimeout": {
"type": "string",
"title": "DiskFullKubelet timeout",
"description": "Determines how long a machine health check should wait for a node with condition DiskFullKubelet=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
"examples": [
"10m",
"100s"
],
"default": ""
},
"diskFullVarLogTimeout": {
"type": "string",
"title": "DiskFullVarLog timeout",
"description": "Determines how long a machine health check should wait for a node with condition DiskFullVarLog=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
"examples": [
"10m",
"100s"
],
"default": ""
},
"enabled": {
"type": "boolean",
"title": "Enable",
"default": true
},
"maxUnhealthy": {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is missing a description.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

"type": "string",
"title": "Maximum unhealthy nodes",
"description": "Defaults to 40% for control plane nodes and 20% for worker nodes.",
"examples": [
"40%"
]
},
"nodeStartupTimeout": {
"type": "string",
"title": "Node startup timeout",
"description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.",
"examples": [
"10m",
"100s"
],
"default": "8m0s"
},
"unhealthyNotReadyTimeout": {
"type": "string",
"title": "Timeout for ready",
"description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.",
"examples": [
"300s"
],
"default": "10m0s"
},
"unhealthyUnknownTimeout": {
"type": "string",
"title": "Timeout for unknown condition",
"description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.",
"examples": [
"300s"
],
"default": "10m0s"
}
}
},
"machinePool": {
"type": "object",
"title": "Node pool",
Expand Down Expand Up @@ -664,6 +744,10 @@
"description": "Size of the volume mounted at `/var/log` on the worker nodes.",
"default": 30
},
"machineHealthCheck": {
"$ref": "#/$defs/machineHealthCheck",
"title": "Machine health check"
},
"maxHealthyPercentage": {
"type": "integer",
"title": "Maximum percentage of instances that can be in service when replacing instances.",
Expand Down Expand Up @@ -834,6 +918,9 @@
"nodeExporter": {
"enable": true
},
"nodeProblemDetector": {
"enable": true
},
"observabilityBundle": {
"enable": true
},
Expand Down Expand Up @@ -1365,6 +1452,12 @@
"title": "node-exporter",
"description": "Configuration of node-exporter. For all available values see https://github.com/giantswarm/node-exporter-app."
},
"nodeProblemDetector": {
"$ref": "#/$defs/helmRelease",
"type": "object",
"title": "node-problem-detector",
"description": "Configuration of node-problem-detector-app. For all available values see https://github.com/giantswarm/node-problem-detector-app."
},
"observabilityBundle": {
"$ref": "#/$defs/app",
"type": "object",
Expand Down Expand Up @@ -2164,52 +2257,8 @@
"default": 15
},
"machineHealthCheck": {
"type": "object",
"title": "Machine health check",
"additionalProperties": false,
"properties": {
"enabled": {
"type": "boolean",
"title": "Enable",
"default": true
},
"maxUnhealthy": {
"type": "string",
"title": "Maximum unhealthy nodes",
"examples": [
"40%"
],
"default": "40%"
},
"nodeStartupTimeout": {
"type": "string",
"title": "Node startup timeout",
"description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.",
"examples": [
"10m",
"100s"
],
"default": "8m0s"
},
"unhealthyNotReadyTimeout": {
"type": "string",
"title": "Timeout for ready",
"description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.",
"examples": [
"300s"
],
"default": "10m0s"
},
"unhealthyUnknownTimeout": {
"type": "string",
"title": "Timeout for unknown condition",
"description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.",
"examples": [
"300s"
],
"default": "10m0s"
}
}
"$ref": "#/$defs/machineHealthCheck",
"title": "Machine health check"
},
"oidc": {
"type": "object",
Expand Down
7 changes: 6 additions & 1 deletion helm/cluster-aws/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ cluster:
enable: true
nodeExporter:
enable: true
nodeProblemDetector:
enable: true
observabilityBundle:
enable: true
observabilityPolicies:
Expand Down Expand Up @@ -296,6 +298,7 @@ global:
netExporter: {}
networkPolicies: {}
nodeExporter: {}
nodeProblemDetector: {}
observabilityBundle: {}
observabilityPolicies: {}
prometheusBlackboxExporter: {}
Expand Down Expand Up @@ -382,8 +385,10 @@ global:
libVolumeSizeGB: 40
logVolumeSizeGB: 15
machineHealthCheck:
diskFullContainerdTimeout: ""
diskFullKubeletTimeout: ""
diskFullVarLogTimeout: ""
enabled: true
maxUnhealthy: 40%
nodeStartupTimeout: 8m0s
unhealthyNotReadyTimeout: 10m0s
unhealthyUnknownTimeout: 10m0s
Expand Down
Loading