diff --git a/CHANGELOG.md b/CHANGELOG.md
index e3bf545bb..5f6d0c64c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+### Added
+
+- Add `node-problem-detector-app`, disabled by default.
+
### Changed
- Tidy up dependencies on `azs-getter`.
diff --git a/helm/cluster-aws/Chart.lock b/helm/cluster-aws/Chart.lock
index 5181f5603..b3127187f 100644
--- a/helm/cluster-aws/Chart.lock
+++ b/helm/cluster-aws/Chart.lock
@@ -1,9 +1,9 @@
dependencies:
- name: cluster
repository: https://giantswarm.github.io/cluster-catalog
- version: 4.3.0
+ version: 4.4.0
- name: cluster-shared
repository: https://giantswarm.github.io/cluster-catalog
version: 0.7.1
-digest: sha256:98128b1252d1b43d0d6778546eded7d0be50da943a7d4e108cf95f581c0d5067
-generated: "2025-10-24T22:03:46.995588877Z"
+digest: sha256:5a5cf5f0c737e152a9196a47a8a59b573f81f7bb6b56326db2728ad917672f0d
+generated: "2025-10-28T10:02:18.817133+01:00"
diff --git a/helm/cluster-aws/Chart.yaml b/helm/cluster-aws/Chart.yaml
index b64da4b35..239fb64f9 100644
--- a/helm/cluster-aws/Chart.yaml
+++ b/helm/cluster-aws/Chart.yaml
@@ -16,7 +16,7 @@ restrictions:
- capa
dependencies:
- name: cluster
- version: "4.3.0"
+ version: "4.4.0"
repository: https://giantswarm.github.io/cluster-catalog
- name: cluster-shared
version: "0.7.1"
diff --git a/helm/cluster-aws/README.md b/helm/cluster-aws/README.md
index 7471afd41..dec66b20f 100644
--- a/helm/cluster-aws/README.md
+++ b/helm/cluster-aws/README.md
@@ -223,6 +223,13 @@ Configuration of apps that are part of the cluster.
| `global.apps.nodeExporter.extraConfigs[*].name` | **Name** - Name of the config map or secret. The object must exist in the same namespace as the cluster App.|**Type:** `[string]`
|
| `global.apps.nodeExporter.extraConfigs[*].priority` | **Priority**|**Type:** `[integer]`
**Default:** `25`|
| `global.apps.nodeExporter.values` | **Config map** - Helm Values to be passed to the app as user config.|**Type:** `[object]`
|
+| `global.apps.nodeProblemDetector` | **App** - Configuration of a default app that is part of the cluster and is deployed as a HelmRelease resource.|**Type:** `[object]`
|
+| `global.apps.nodeProblemDetector.extraConfigs` | **Extra config maps or secrets** - Extra config maps or secrets that will be used to customize to the app. The desired values must be under configmap or secret key 'values'. The values are merged in the order given, with the later values overwriting earlier, and then inline values overwriting those. Resources must be in the same namespace as the cluster.|**Type:** `[array]`
|
+| `global.apps.nodeProblemDetector.extraConfigs[*]` | **Config map or secret**|**Type:** `[object]`
|
+| `global.apps.nodeProblemDetector.extraConfigs[*].kind` | **Kind** - Specifies whether the resource is a config map or a secret.|**Type:** `[string]`
|
+| `global.apps.nodeProblemDetector.extraConfigs[*].name` | **Name** - Name of the config map or secret. The object must exist in the same namespace as the cluster App.|**Type:** `[string]`
|
+| `global.apps.nodeProblemDetector.extraConfigs[*].optional` | **Optional** - Optional marks this ValuesReference as optional. When set, a not found error for the values reference is ignored, but any ValuesKey, TargetPath or transient error will still result in a reconciliation failure.|**Type:** `[boolean]`
|
+| `global.apps.nodeProblemDetector.values` | **Values** - Values to be passed to the app. Values will have higher priority than values from configmaps.|**Type:** `[object]`
|
| `global.apps.observabilityBundle` | **App resource** - Configuration of a default app that is part of the cluster and is deployed as an App resource.|**Type:** `[object]`
|
| `global.apps.observabilityBundle.extraConfigs` | **Extra config maps or secrets** - Extra config maps or secrets that will be used to customize to the app. The desired values must be under configmap or secret key 'values'. The values are merged in the order given, with the later values overwriting earlier, and then inline values overwriting those. Resources must be in the same namespace as the cluster.|**Type:** `[array]`
|
| `global.apps.observabilityBundle.extraConfigs[*]` | **Config map or secret**|**Type:** `[object]`
|
@@ -393,8 +400,11 @@ Properties within the `.global.controlPlane` object
| `global.controlPlane.loadBalancerIngressAllowCidrBlocks[*]` | **Address range**|**Type:** `[string]`
|
| `global.controlPlane.logVolumeSizeGB` | **Log volume size (GB)** - Size of the volume mounted at /var/log on the control plane nodes.|**Type:** `[integer]`
**Default:** `15`|
| `global.controlPlane.machineHealthCheck` | **Machine health check**|**Type:** `[object]`
|
+| `global.controlPlane.machineHealthCheck.diskFullContainerdTimeout` | **DiskFullContainerd timeout** - Determines how long a machine health check should wait for a node with condition DiskFullContainerd=True before considering a machine unhealthy. Use an empty value to not consider this condition.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `""`|
+| `global.controlPlane.machineHealthCheck.diskFullKubeletTimeout` | **DiskFullKubelet timeout** - Determines how long a machine health check should wait for a node with condition DiskFullKubelet=True before considering a machine unhealthy. Use an empty value to not consider this condition.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `""`|
+| `global.controlPlane.machineHealthCheck.diskFullVarLogTimeout` | **DiskFullVarLog timeout** - Determines how long a machine health check should wait for a node with condition DiskFullVarLog=True before considering a machine unhealthy. Use an empty value to not consider this condition.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `""`|
| `global.controlPlane.machineHealthCheck.enabled` | **Enable**|**Type:** `[boolean]`
**Default:** `true`|
-| `global.controlPlane.machineHealthCheck.maxUnhealthy` | **Maximum unhealthy nodes**|**Type:** `[string]`
**Example:** `"40%"`
**Default:** `"40%"`|
+| `global.controlPlane.machineHealthCheck.maxUnhealthy` | **Maximum unhealthy nodes** - Defaults to 40% for control plane nodes and 20% for worker nodes.|**Type:** `[string]`
**Example:** `"40%"`
|
| `global.controlPlane.machineHealthCheck.nodeStartupTimeout` | **Node startup timeout** - Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `"8m0s"`|
| `global.controlPlane.machineHealthCheck.unhealthyNotReadyTimeout` | **Timeout for ready** - If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.|**Type:** `[string]`
**Example:** `"300s"`
**Default:** `"10m0s"`|
| `global.controlPlane.machineHealthCheck.unhealthyUnknownTimeout` | **Timeout for unknown condition** - If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.|**Type:** `[string]`
**Example:** `"300s"`
**Default:** `"10m0s"`|
@@ -465,7 +475,7 @@ Information about the workload cluster release.
| **Property** | **Description** | **More Details** |
| :----------- | :-------------- | :--------------- |
| `baseDomain` | **Base DNS domain**|**Type:** `[string]`
|
-| `cluster` | **Cluster** - Helm values for the provider-independent cluster chart|**Type:** `[object]`
**Default:** `{"providerIntegration":{"apps":{"certExporter":{"enable":true},"certManager":{"configTemplateName":"awsCertManagerHelmValues","enable":true},"chartOperatorExtensions":{"enable":true},"cilium":{"configTemplateName":"awsCiliumHelmValues","enable":true},"ciliumServiceMonitors":{"enable":true},"clusterAutoscaler":{"configTemplateName":"awsClusterAutoscalerHelmValues","enable":true},"coreDns":{"configTemplateName":"awsCorednsHelmValues","enable":true},"coreDnsExtensions":{"enable":true},"etcdDefrag":{"enable":true},"etcdKubernetesResourcesCountExporter":{"enable":true},"externalDns":{"configTemplateName":"awsExternalDnsHelmValues","enable":true},"k8sAuditMetrics":{"enable":true},"k8sDnsNodeCache":{"enable":true},"metricsServer":{"enable":true},"netExporter":{"enable":true},"networkPolicies":{"configTemplateName":"awsNetworkPoliciesHelmValues","enable":true},"nodeExporter":{"enable":true},"observabilityBundle":{"enable":true},"observabilityPolicies":{"enable":true},"prometheusBlackboxExporter":{"enable":true},"securityBundle":{"configTemplateName":"awsSecurityBundleHelmValues","enable":true},"teleportKubeAgent":{"enable":true},"verticalPodAutoscaler":{"enable":true},"verticalPodAutoscalerCrd":{"enable":true}},"clusterAnnotationsTemplateName":"awsConnectivityLabels","components":{"systemd":{"timesyncd":{"ntp":["169.254.169.123"]}}},"connectivity":{"proxy":{"noProxy":{"templateName":"awsNoProxyList","value":["elb.amazonaws.com","169.254.169.254"]}}},"controlPlane":{"kubeadmConfig":{"clusterConfiguration":{"apiServer":{"apiAudiences":{"templateName":"awsApiServerApiAudiences"},"serviceAccountIssuers":[{"templateName":"awsIrsaServiceAccountIssuer"}]}},"files":[],"ignition":{"containerLinuxConfig":{"additionalConfig":{"storage":{"filesystems":[{"mount":{"device":"/dev/xvdc","format":"xfs","label":"etcd","wipeFilesystem":true},"name":"etcd"},{"mount":{"device":"/dev/xvdd","format":"xfs","label":"lib","wipeFilesystem":true},"name":"lib"},{"mount":{"device":"/dev/xv
de","format":"xfs","label":"log","wipeFilesystem":true},"name":"log"}]},"systemd":{"units":[{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/etcd","where":"/var/lib/etcd"},"unit":{"defaultDependencies":false,"description":"etcd volume"}},"enabled":true,"name":"var-lib-etcd.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/lib","where":"/var/lib"},"unit":{"defaultDependencies":false,"description":"var lib volume"}},"enabled":true,"name":"var-lib.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/log","where":"/var/log"},"unit":{"defaultDependencies":false,"description":"log volume"}},"enabled":true,"name":"var-log.mount"}]}}}}},"resources":{"infrastructureMachineTemplate":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSMachineTemplate","version":"v1beta2"},"infrastructureMachineTemplateSpecTemplateName":"controlplane-awsmachinetemplate-spec"}},"environmentVariables":{"hostName":"COREOS_EC2_HOSTNAME","ipv4":"COREOS_EC2_IPV4_LOCAL"},"kubeadmConfig":{"files":[{"contentFrom":{"secret":{"key":"kubelet-aws-config.sh","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/opt/bin/kubelet-aws-config.sh","permissions":"0755"},{"contentFrom":{"secret":{"key":"kubelet-aws-config.service","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/etc/systemd/system/kubelet-aws-config.service","permissions":"0644"},{"contentFrom":{"secret":{"key":"99-unmanaged-devices.network","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/etc/systemd/network/99-unmanaged-devices.network","permissions":"0644"}],"ignition":{"containerLinuxConfig":{"additionalConfig":{"systemd":{"units":[{"enabled":true,"name":"kubelet-aws-config.service"}]}}}}},"osImage":{"variant":"3"},"pauseProperties":{"global.connectivity.vpcMode":"priva
te"},"provider":"aws","registry":{"templateName":"awsContainerImageRegistry"},"resourcesApi":{"bastionResourceEnabled":false,"cleanupHelmReleaseResourcesEnabled":true,"clusterResourceEnabled":true,"controlPlaneResourceEnabled":true,"helmRepositoryResourcesEnabled":true,"infrastructureCluster":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSCluster","version":"v1beta2"},"infrastructureMachinePool":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSMachinePool","version":"v1beta2"},"machineHealthCheckResourceEnabled":true,"machinePoolResourcesEnabled":true,"nodePoolKind":"MachinePool"},"useReleases":true,"workers":{"defaultNodePools":{"def00":{"customNodeLabels":["label=default"],"instanceType":"r6i.xlarge","instanceWarmup":600,"maxSize":3,"minHealthyPercentage":90,"minSize":3}},"kubeadmConfig":{"files":[],"ignition":{"containerLinuxConfig":{"additionalConfig":{"storage":{"filesystems":[{"mount":{"device":"/dev/xvdd","format":"xfs","label":"lib","wipeFilesystem":true},"name":"lib"},{"mount":{"device":"/dev/xvde","format":"xfs","label":"log","wipeFilesystem":true},"name":"log"}]},"systemd":{"units":[{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/lib","where":"/var/lib"},"unit":{"defaultDependencies":false,"description":"lib volume"}},"enabled":true,"name":"var-lib.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/log","where":"/var/log"},"unit":{"defaultDependencies":false,"description":"log volume"}},"enabled":true,"name":"var-log.mount"}]}}}},"taints":[{"effect":"NoExecute","key":"ebs.csi.aws.com/agent-not-ready"}]}}}}`|
+| `cluster` | **Cluster** - Helm values for the provider-independent cluster chart|**Type:** `[object]`
**Default:** `{"providerIntegration":{"apps":{"certExporter":{"enable":true},"certManager":{"configTemplateName":"awsCertManagerHelmValues","enable":true},"chartOperatorExtensions":{"enable":true},"cilium":{"configTemplateName":"awsCiliumHelmValues","enable":true},"ciliumServiceMonitors":{"enable":true},"clusterAutoscaler":{"configTemplateName":"awsClusterAutoscalerHelmValues","enable":true},"coreDns":{"configTemplateName":"awsCorednsHelmValues","enable":true},"coreDnsExtensions":{"enable":true},"etcdDefrag":{"enable":true},"etcdKubernetesResourcesCountExporter":{"enable":true},"externalDns":{"configTemplateName":"awsExternalDnsHelmValues","enable":true},"k8sAuditMetrics":{"enable":true},"k8sDnsNodeCache":{"enable":true},"metricsServer":{"enable":true},"netExporter":{"enable":true},"networkPolicies":{"configTemplateName":"awsNetworkPoliciesHelmValues","enable":true},"nodeExporter":{"enable":true},"nodeProblemDetector":{"enable":true},"observabilityBundle":{"enable":true},"observabilityPolicies":{"enable":true},"prometheusBlackboxExporter":{"enable":true},"securityBundle":{"configTemplateName":"awsSecurityBundleHelmValues","enable":true},"teleportKubeAgent":{"enable":true},"verticalPodAutoscaler":{"enable":true},"verticalPodAutoscalerCrd":{"enable":true}},"clusterAnnotationsTemplateName":"awsConnectivityLabels","components":{"systemd":{"timesyncd":{"ntp":["169.254.169.123"]}}},"connectivity":{"proxy":{"noProxy":{"templateName":"awsNoProxyList","value":["elb.amazonaws.com","169.254.169.254"]}}},"controlPlane":{"kubeadmConfig":{"clusterConfiguration":{"apiServer":{"apiAudiences":{"templateName":"awsApiServerApiAudiences"},"serviceAccountIssuers":[{"templateName":"awsIrsaServiceAccountIssuer"}]}},"files":[],"ignition":{"containerLinuxConfig":{"additionalConfig":{"storage":{"filesystems":[{"mount":{"device":"/dev/xvdc","format":"xfs","label":"etcd","wipeFilesystem":true},"name":"etcd"},{"mount":{"device":"/dev/xvdd","format":"xfs","label":"lib","wipeFilesystem":true},"na
me":"lib"},{"mount":{"device":"/dev/xvde","format":"xfs","label":"log","wipeFilesystem":true},"name":"log"}]},"systemd":{"units":[{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/etcd","where":"/var/lib/etcd"},"unit":{"defaultDependencies":false,"description":"etcd volume"}},"enabled":true,"name":"var-lib-etcd.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/lib","where":"/var/lib"},"unit":{"defaultDependencies":false,"description":"var lib volume"}},"enabled":true,"name":"var-lib.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/log","where":"/var/log"},"unit":{"defaultDependencies":false,"description":"log volume"}},"enabled":true,"name":"var-log.mount"}]}}}}},"resources":{"infrastructureMachineTemplate":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSMachineTemplate","version":"v1beta2"},"infrastructureMachineTemplateSpecTemplateName":"controlplane-awsmachinetemplate-spec"}},"environmentVariables":{"hostName":"COREOS_EC2_HOSTNAME","ipv4":"COREOS_EC2_IPV4_LOCAL"},"kubeadmConfig":{"files":[{"contentFrom":{"secret":{"key":"kubelet-aws-config.sh","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/opt/bin/kubelet-aws-config.sh","permissions":"0755"},{"contentFrom":{"secret":{"key":"kubelet-aws-config.service","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/etc/systemd/system/kubelet-aws-config.service","permissions":"0644"},{"contentFrom":{"secret":{"key":"99-unmanaged-devices.network","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/etc/systemd/network/99-unmanaged-devices.network","permissions":"0644"}],"ignition":{"containerLinuxConfig":{"additionalConfig":{"systemd":{"units":[{"enabled":true,"name":"kubelet-aws-config.service"}]}}}}},"osImage":{"variant":"3"},"pauseProperties"
:{"global.connectivity.vpcMode":"private"},"provider":"aws","registry":{"templateName":"awsContainerImageRegistry"},"resourcesApi":{"bastionResourceEnabled":false,"cleanupHelmReleaseResourcesEnabled":true,"clusterResourceEnabled":true,"controlPlaneResourceEnabled":true,"helmRepositoryResourcesEnabled":true,"infrastructureCluster":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSCluster","version":"v1beta2"},"infrastructureMachinePool":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSMachinePool","version":"v1beta2"},"machineHealthCheckResourceEnabled":true,"machinePoolResourcesEnabled":true,"nodePoolKind":"MachinePool"},"useReleases":true,"workers":{"defaultNodePools":{"def00":{"customNodeLabels":["label=default"],"instanceType":"r6i.xlarge","instanceWarmup":600,"maxSize":3,"minHealthyPercentage":90,"minSize":3}},"kubeadmConfig":{"files":[],"ignition":{"containerLinuxConfig":{"additionalConfig":{"storage":{"filesystems":[{"mount":{"device":"/dev/xvdd","format":"xfs","label":"lib","wipeFilesystem":true},"name":"lib"},{"mount":{"device":"/dev/xvde","format":"xfs","label":"log","wipeFilesystem":true},"name":"log"}]},"systemd":{"units":[{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/lib","where":"/var/lib"},"unit":{"defaultDependencies":false,"description":"lib volume"}},"enabled":true,"name":"var-lib.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/log","where":"/var/log"},"unit":{"defaultDependencies":false,"description":"log volume"}},"enabled":true,"name":"var-log.mount"}]}}}},"taints":[{"effect":"NoExecute","key":"ebs.csi.aws.com/agent-not-ready"}]}}}}`|
| `cluster-shared` | **Library chart**|**Type:** `[object]`
|
| `managementCluster` | **Management cluster** - Name of the Cluster API cluster managing this workload cluster.|**Type:** `[string]`
|
| `provider` | **Cluster API provider name**|**Type:** `[string]`
|
diff --git a/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-control-plane-values.yaml b/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-control-plane-values.yaml
new file mode 100644
index 000000000..db591593d
--- /dev/null
+++ b/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-control-plane-values.yaml
@@ -0,0 +1,22 @@
+global:
+ release:
+ version: 29.1.0
+ metadata:
+ name: node-problem-detector-used
+ organization: test
+ servicePriority: lowest
+ connectivity:
+ baseDomain: example.com
+ controlPlane:
+ machineHealthCheck:
+ enabled: true
+ diskFullVarLogTimeout: "1337m"
+ providerSpecific:
+ region: "eu-west-1"
+ managementCluster: test
+
+cluster:
+ internal:
+ ephemeralConfiguration:
+ offlineTesting:
+ renderWithoutReleaseResource: true
diff --git a/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-workers-values.yaml b/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-workers-values.yaml
new file mode 100644
index 000000000..749410d70
--- /dev/null
+++ b/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-workers-values.yaml
@@ -0,0 +1,25 @@
+global:
+ release:
+ version: 29.1.0
+ metadata:
+ name: node-problem-detector-used
+ organization: test
+ servicePriority: lowest
+ connectivity:
+ baseDomain: example.com
+ nodePools:
+ pool0:
+ maxSize: 3
+ minSize: 3
+ machineHealthCheck:
+ enabled: true
+ diskFullContainerdTimeout: "1337m"
+ providerSpecific:
+ region: "eu-west-1"
+ managementCluster: test
+
+cluster:
+ internal:
+ ephemeralConfiguration:
+ offlineTesting:
+ renderWithoutReleaseResource: true
diff --git a/helm/cluster-aws/values.schema.json b/helm/cluster-aws/values.schema.json
index 330c2abc5..2ad1b58bb 100644
--- a/helm/cluster-aws/values.schema.json
+++ b/helm/cluster-aws/values.schema.json
@@ -168,6 +168,86 @@
}
}
},
+ "machineHealthCheck": {
+ "type": "object",
+ "title": "Machine health check",
+ "required": [
+ "enabled"
+ ],
+ "properties": {
+ "diskFullContainerdTimeout": {
+ "type": "string",
+ "title": "DiskFullContainerd timeout",
+ "description": "Determines how long a machine health check should wait for a node with condition DiskFullContainerd=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
+ "examples": [
+ "10m",
+ "100s"
+ ],
+ "default": ""
+ },
+ "diskFullKubeletTimeout": {
+ "type": "string",
+ "title": "DiskFullKubelet timeout",
+ "description": "Determines how long a machine health check should wait for a node with condition DiskFullKubelet=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
+ "examples": [
+ "10m",
+ "100s"
+ ],
+ "default": ""
+ },
+ "diskFullVarLogTimeout": {
+ "type": "string",
+ "title": "DiskFullVarLog timeout",
+ "description": "Determines how long a machine health check should wait for a node with condition DiskFullVarLog=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
+ "examples": [
+ "10m",
+ "100s"
+ ],
+ "default": ""
+ },
+ "enabled": {
+ "type": "boolean",
+ "title": "Enable",
+ "default": true
+ },
+ "maxUnhealthy": {
+ "type": "string",
+ "title": "Maximum unhealthy nodes",
+ "description": "Defaults to 40% for control plane nodes and 20% for worker nodes.",
+ "examples": [
+ "40%"
+ ]
+ },
+ "nodeStartupTimeout": {
+ "type": "string",
+ "title": "Node startup timeout",
+ "description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.",
+ "examples": [
+ "10m",
+ "100s"
+ ],
+ "default": "8m0s"
+ },
+ "unhealthyNotReadyTimeout": {
+ "type": "string",
+ "title": "Timeout for ready",
+ "description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.",
+ "examples": [
+ "300s"
+ ],
+ "default": "10m0s"
+ },
+ "unhealthyUnknownTimeout": {
+ "type": "string",
+ "title": "Timeout for unknown condition",
+ "description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.",
+ "examples": [
+ "300s"
+ ],
+ "default": "10m0s"
+ }
+ }
+ },
"machinePool": {
"type": "object",
"title": "Node pool",
@@ -664,6 +744,10 @@
"description": "Size of the volume mounted at `/var/log` on the worker nodes.",
"default": 30
},
+ "machineHealthCheck": {
+ "$ref": "#/$defs/machineHealthCheck",
+ "title": "Machine health check"
+ },
"maxHealthyPercentage": {
"type": "integer",
"title": "Maximum percentage of instances that can be in service when replacing instances.",
@@ -834,6 +918,9 @@
"nodeExporter": {
"enable": true
},
+ "nodeProblemDetector": {
+ "enable": true
+ },
"observabilityBundle": {
"enable": true
},
@@ -1365,6 +1452,12 @@
"title": "node-exporter",
"description": "Configuration of node-exporter. For all available values see https://github.com/giantswarm/node-exporter-app."
},
+ "nodeProblemDetector": {
+ "$ref": "#/$defs/helmRelease",
+ "type": "object",
+ "title": "node-problem-detector",
+ "description": "Configuration of node-problem-detector-app. For all available values see https://github.com/giantswarm/node-problem-detector-app."
+ },
"observabilityBundle": {
"$ref": "#/$defs/app",
"type": "object",
@@ -2164,52 +2257,8 @@
"default": 15
},
"machineHealthCheck": {
- "type": "object",
- "title": "Machine health check",
- "additionalProperties": false,
- "properties": {
- "enabled": {
- "type": "boolean",
- "title": "Enable",
- "default": true
- },
- "maxUnhealthy": {
- "type": "string",
- "title": "Maximum unhealthy nodes",
- "examples": [
- "40%"
- ],
- "default": "40%"
- },
- "nodeStartupTimeout": {
- "type": "string",
- "title": "Node startup timeout",
- "description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.",
- "examples": [
- "10m",
- "100s"
- ],
- "default": "8m0s"
- },
- "unhealthyNotReadyTimeout": {
- "type": "string",
- "title": "Timeout for ready",
- "description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.",
- "examples": [
- "300s"
- ],
- "default": "10m0s"
- },
- "unhealthyUnknownTimeout": {
- "type": "string",
- "title": "Timeout for unknown condition",
- "description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.",
- "examples": [
- "300s"
- ],
- "default": "10m0s"
- }
- }
+ "$ref": "#/$defs/machineHealthCheck",
+ "title": "Machine health check"
},
"oidc": {
"type": "object",
diff --git a/helm/cluster-aws/values.yaml b/helm/cluster-aws/values.yaml
index 82857e2cf..ecb93c1db 100644
--- a/helm/cluster-aws/values.yaml
+++ b/helm/cluster-aws/values.yaml
@@ -43,6 +43,8 @@ cluster:
enable: true
nodeExporter:
enable: true
+ nodeProblemDetector:
+ enable: true
observabilityBundle:
enable: true
observabilityPolicies:
@@ -296,6 +298,7 @@ global:
netExporter: {}
networkPolicies: {}
nodeExporter: {}
+ nodeProblemDetector: {}
observabilityBundle: {}
observabilityPolicies: {}
prometheusBlackboxExporter: {}
@@ -382,8 +385,10 @@ global:
libVolumeSizeGB: 40
logVolumeSizeGB: 15
machineHealthCheck:
+ diskFullContainerdTimeout: ""
+ diskFullKubeletTimeout: ""
+ diskFullVarLogTimeout: ""
enabled: true
- maxUnhealthy: 40%
nodeStartupTimeout: 8m0s
unhealthyNotReadyTimeout: 10m0s
unhealthyUnknownTimeout: 10m0s