From 88966a84e8ad7b914b6973b409fb414c5e2ec160 Mon Sep 17 00:00:00 2001 From: Andreas Sommer Date: Tue, 21 Oct 2025 09:24:45 +0200 Subject: [PATCH 1/5] Add node-problem-detector-app, add machine health check schema --- CHANGELOG.md | 4 + helm/cluster-aws/Chart.lock | 8 +- helm/cluster-aws/Chart.yaml | 4 +- helm/cluster-aws/README.md | 5 +- ...em-detector-used-control-plane-values.yaml | 22 +++ ...-problem-detector-used-workers-values.yaml | 25 ++++ helm/cluster-aws/values.schema.json | 135 ++++++++++++------ helm/cluster-aws/values.yaml | 5 + 8 files changed, 155 insertions(+), 53 deletions(-) create mode 100644 helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-control-plane-values.yaml create mode 100644 helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-workers-values.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 305ec7cdb..d38973a74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Add node-problem-detector-app, disabled by default + ## [6.3.0] - 2025-10-24 ### Changed diff --git a/helm/cluster-aws/Chart.lock b/helm/cluster-aws/Chart.lock index 5181f5603..13042873d 100644 --- a/helm/cluster-aws/Chart.lock +++ b/helm/cluster-aws/Chart.lock @@ -1,9 +1,9 @@ dependencies: - name: cluster - repository: https://giantswarm.github.io/cluster-catalog - version: 4.3.0 + repository: https://giantswarm.github.io/cluster-test-catalog + version: 4.3.0-2795756206b2d7c231c07725e30ea51ad63e2a8d - name: cluster-shared repository: https://giantswarm.github.io/cluster-catalog version: 0.7.1 -digest: sha256:98128b1252d1b43d0d6778546eded7d0be50da943a7d4e108cf95f581c0d5067 -generated: "2025-10-24T22:03:46.995588877Z" +digest: sha256:9221c097bf3c9aaa096aee4f48437cd290b0c577ed991f58bfb2becdf3183797 +generated: "2025-10-27T09:41:32.757712+01:00" diff --git a/helm/cluster-aws/Chart.yaml b/helm/cluster-aws/Chart.yaml 
index b64da4b35..0041ddd3e 100644 --- a/helm/cluster-aws/Chart.yaml +++ b/helm/cluster-aws/Chart.yaml @@ -16,8 +16,8 @@ restrictions: - capa dependencies: - name: cluster - version: "4.3.0" - repository: https://giantswarm.github.io/cluster-catalog + version: "4.3.0-2795756206b2d7c231c07725e30ea51ad63e2a8d" + repository: https://giantswarm.github.io/cluster-test-catalog - name: cluster-shared version: "0.7.1" repository: https://giantswarm.github.io/cluster-catalog diff --git a/helm/cluster-aws/README.md b/helm/cluster-aws/README.md index 7471afd41..ff5ab2f8e 100644 --- a/helm/cluster-aws/README.md +++ b/helm/cluster-aws/README.md @@ -393,6 +393,9 @@ Properties within the `.global.controlPlane` object | `global.controlPlane.loadBalancerIngressAllowCidrBlocks[*]` | **Address range**|**Type:** `[string]`
| | `global.controlPlane.logVolumeSizeGB` | **Log volume size (GB)** - Size of the volume mounted at /var/log on the control plane nodes.|**Type:** `[integer]`
**Default:** `15`| | `global.controlPlane.machineHealthCheck` | **Machine health check**|**Type:** `[object]`
| +| `global.controlPlane.machineHealthCheck.diskFullContainerdTimeout` | **DiskFullContainerd timeout** - Determines how long a machine health check should wait for a node with condition DiskFullContainerd=True before considering a machine unhealthy. Use an empty value to not consider this condition.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `""`| +| `global.controlPlane.machineHealthCheck.diskFullKubeletTimeout` | **DiskFullKubelet timeout** - Determines how long a machine health check should wait for a node with condition DiskFullKubelet=True before considering a machine unhealthy. Use an empty value to not consider this condition.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `""`| +| `global.controlPlane.machineHealthCheck.diskFullVarLogTimeout` | **DiskFullVarLog timeout** - Determines how long a machine health check should wait for a node with condition DiskFullVarLog=True before considering a machine unhealthy. Use an empty value to not consider this condition.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `""`| | `global.controlPlane.machineHealthCheck.enabled` | **Enable**|**Type:** `[boolean]`
**Default:** `true`| | `global.controlPlane.machineHealthCheck.maxUnhealthy` | **Maximum unhealthy nodes**|**Type:** `[string]`
**Example:** `"40%"`
**Default:** `"40%"`| | `global.controlPlane.machineHealthCheck.nodeStartupTimeout` | **Node startup timeout** - Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `"8m0s"`| @@ -465,7 +468,7 @@ Information about the workload cluster release. | **Property** | **Description** | **More Details** | | :----------- | :-------------- | :--------------- | | `baseDomain` | **Base DNS domain**|**Type:** `[string]`
| -| `cluster` | **Cluster** - Helm values for the provider-independent cluster chart|**Type:** `[object]`
**Default:** `{"providerIntegration":{"apps":{"certExporter":{"enable":true},"certManager":{"configTemplateName":"awsCertManagerHelmValues","enable":true},"chartOperatorExtensions":{"enable":true},"cilium":{"configTemplateName":"awsCiliumHelmValues","enable":true},"ciliumServiceMonitors":{"enable":true},"clusterAutoscaler":{"configTemplateName":"awsClusterAutoscalerHelmValues","enable":true},"coreDns":{"configTemplateName":"awsCorednsHelmValues","enable":true},"coreDnsExtensions":{"enable":true},"etcdDefrag":{"enable":true},"etcdKubernetesResourcesCountExporter":{"enable":true},"externalDns":{"configTemplateName":"awsExternalDnsHelmValues","enable":true},"k8sAuditMetrics":{"enable":true},"k8sDnsNodeCache":{"enable":true},"metricsServer":{"enable":true},"netExporter":{"enable":true},"networkPolicies":{"configTemplateName":"awsNetworkPoliciesHelmValues","enable":true},"nodeExporter":{"enable":true},"observabilityBundle":{"enable":true},"observabilityPolicies":{"enable":true},"prometheusBlackboxExporter":{"enable":true},"securityBundle":{"configTemplateName":"awsSecurityBundleHelmValues","enable":true},"teleportKubeAgent":{"enable":true},"verticalPodAutoscaler":{"enable":true},"verticalPodAutoscalerCrd":{"enable":true}},"clusterAnnotationsTemplateName":"awsConnectivityLabels","components":{"systemd":{"timesyncd":{"ntp":["169.254.169.123"]}}},"connectivity":{"proxy":{"noProxy":{"templateName":"awsNoProxyList","value":["elb.amazonaws.com","169.254.169.254"]}}},"controlPlane":{"kubeadmConfig":{"clusterConfiguration":{"apiServer":{"apiAudiences":{"templateName":"awsApiServerApiAudiences"},"serviceAccountIssuers":[{"templateName":"awsIrsaServiceAccountIssuer"}]}},"files":[],"ignition":{"containerLinuxConfig":{"additionalConfig":{"storage":{"filesystems":[{"mount":{"device":"/dev/xvdc","format":"xfs","label":"etcd","wipeFilesystem":true},"name":"etcd"},{"mount":{"device":"/dev/xvdd","format":"xfs","label":"lib","wipeFilesystem":true},"name":"lib"},{"mount":{"device":"/dev/xv
de","format":"xfs","label":"log","wipeFilesystem":true},"name":"log"}]},"systemd":{"units":[{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/etcd","where":"/var/lib/etcd"},"unit":{"defaultDependencies":false,"description":"etcd volume"}},"enabled":true,"name":"var-lib-etcd.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/lib","where":"/var/lib"},"unit":{"defaultDependencies":false,"description":"var lib volume"}},"enabled":true,"name":"var-lib.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/log","where":"/var/log"},"unit":{"defaultDependencies":false,"description":"log volume"}},"enabled":true,"name":"var-log.mount"}]}}}}},"resources":{"infrastructureMachineTemplate":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSMachineTemplate","version":"v1beta2"},"infrastructureMachineTemplateSpecTemplateName":"controlplane-awsmachinetemplate-spec"}},"environmentVariables":{"hostName":"COREOS_EC2_HOSTNAME","ipv4":"COREOS_EC2_IPV4_LOCAL"},"kubeadmConfig":{"files":[{"contentFrom":{"secret":{"key":"kubelet-aws-config.sh","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/opt/bin/kubelet-aws-config.sh","permissions":"0755"},{"contentFrom":{"secret":{"key":"kubelet-aws-config.service","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/etc/systemd/system/kubelet-aws-config.service","permissions":"0644"},{"contentFrom":{"secret":{"key":"99-unmanaged-devices.network","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/etc/systemd/network/99-unmanaged-devices.network","permissions":"0644"}],"ignition":{"containerLinuxConfig":{"additionalConfig":{"systemd":{"units":[{"enabled":true,"name":"kubelet-aws-config.service"}]}}}}},"osImage":{"variant":"3"},"pauseProperties":{"global.connectivity.vpcMode":"priva
te"},"provider":"aws","registry":{"templateName":"awsContainerImageRegistry"},"resourcesApi":{"bastionResourceEnabled":false,"cleanupHelmReleaseResourcesEnabled":true,"clusterResourceEnabled":true,"controlPlaneResourceEnabled":true,"helmRepositoryResourcesEnabled":true,"infrastructureCluster":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSCluster","version":"v1beta2"},"infrastructureMachinePool":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSMachinePool","version":"v1beta2"},"machineHealthCheckResourceEnabled":true,"machinePoolResourcesEnabled":true,"nodePoolKind":"MachinePool"},"useReleases":true,"workers":{"defaultNodePools":{"def00":{"customNodeLabels":["label=default"],"instanceType":"r6i.xlarge","instanceWarmup":600,"maxSize":3,"minHealthyPercentage":90,"minSize":3}},"kubeadmConfig":{"files":[],"ignition":{"containerLinuxConfig":{"additionalConfig":{"storage":{"filesystems":[{"mount":{"device":"/dev/xvdd","format":"xfs","label":"lib","wipeFilesystem":true},"name":"lib"},{"mount":{"device":"/dev/xvde","format":"xfs","label":"log","wipeFilesystem":true},"name":"log"}]},"systemd":{"units":[{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/lib","where":"/var/lib"},"unit":{"defaultDependencies":false,"description":"lib volume"}},"enabled":true,"name":"var-lib.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/log","where":"/var/log"},"unit":{"defaultDependencies":false,"description":"log volume"}},"enabled":true,"name":"var-log.mount"}]}}}},"taints":[{"effect":"NoExecute","key":"ebs.csi.aws.com/agent-not-ready"}]}}}}`| +| `cluster` | **Cluster** - Helm values for the provider-independent cluster chart|**Type:** `[object]`
**Default:** `{"providerIntegration":{"apps":{"certExporter":{"enable":true},"certManager":{"configTemplateName":"awsCertManagerHelmValues","enable":true},"chartOperatorExtensions":{"enable":true},"cilium":{"configTemplateName":"awsCiliumHelmValues","enable":true},"ciliumServiceMonitors":{"enable":true},"clusterAutoscaler":{"configTemplateName":"awsClusterAutoscalerHelmValues","enable":true},"coreDns":{"configTemplateName":"awsCorednsHelmValues","enable":true},"coreDnsExtensions":{"enable":true},"etcdDefrag":{"enable":true},"etcdKubernetesResourcesCountExporter":{"enable":true},"externalDns":{"configTemplateName":"awsExternalDnsHelmValues","enable":true},"k8sAuditMetrics":{"enable":true},"k8sDnsNodeCache":{"enable":true},"metricsServer":{"enable":true},"netExporter":{"enable":true},"networkPolicies":{"configTemplateName":"awsNetworkPoliciesHelmValues","enable":true},"nodeExporter":{"enable":true},"nodeProblemDetector":{"enable":true},"observabilityBundle":{"enable":true},"observabilityPolicies":{"enable":true},"prometheusBlackboxExporter":{"enable":true},"securityBundle":{"configTemplateName":"awsSecurityBundleHelmValues","enable":true},"teleportKubeAgent":{"enable":true},"verticalPodAutoscaler":{"enable":true},"verticalPodAutoscalerCrd":{"enable":true}},"clusterAnnotationsTemplateName":"awsConnectivityLabels","components":{"systemd":{"timesyncd":{"ntp":["169.254.169.123"]}}},"connectivity":{"proxy":{"noProxy":{"templateName":"awsNoProxyList","value":["elb.amazonaws.com","169.254.169.254"]}}},"controlPlane":{"kubeadmConfig":{"clusterConfiguration":{"apiServer":{"apiAudiences":{"templateName":"awsApiServerApiAudiences"},"serviceAccountIssuers":[{"templateName":"awsIrsaServiceAccountIssuer"}]}},"files":[],"ignition":{"containerLinuxConfig":{"additionalConfig":{"storage":{"filesystems":[{"mount":{"device":"/dev/xvdc","format":"xfs","label":"etcd","wipeFilesystem":true},"name":"etcd"},{"mount":{"device":"/dev/xvdd","format":"xfs","label":"lib","wipeFilesystem":true},"na
me":"lib"},{"mount":{"device":"/dev/xvde","format":"xfs","label":"log","wipeFilesystem":true},"name":"log"}]},"systemd":{"units":[{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/etcd","where":"/var/lib/etcd"},"unit":{"defaultDependencies":false,"description":"etcd volume"}},"enabled":true,"name":"var-lib-etcd.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/lib","where":"/var/lib"},"unit":{"defaultDependencies":false,"description":"var lib volume"}},"enabled":true,"name":"var-lib.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/log","where":"/var/log"},"unit":{"defaultDependencies":false,"description":"log volume"}},"enabled":true,"name":"var-log.mount"}]}}}}},"resources":{"infrastructureMachineTemplate":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSMachineTemplate","version":"v1beta2"},"infrastructureMachineTemplateSpecTemplateName":"controlplane-awsmachinetemplate-spec"}},"environmentVariables":{"hostName":"COREOS_EC2_HOSTNAME","ipv4":"COREOS_EC2_IPV4_LOCAL"},"kubeadmConfig":{"files":[{"contentFrom":{"secret":{"key":"kubelet-aws-config.sh","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/opt/bin/kubelet-aws-config.sh","permissions":"0755"},{"contentFrom":{"secret":{"key":"kubelet-aws-config.service","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/etc/systemd/system/kubelet-aws-config.service","permissions":"0644"},{"contentFrom":{"secret":{"key":"99-unmanaged-devices.network","name":"provider-specific-files-4","prependClusterNameAsPrefix":true}},"path":"/etc/systemd/network/99-unmanaged-devices.network","permissions":"0644"}],"ignition":{"containerLinuxConfig":{"additionalConfig":{"systemd":{"units":[{"enabled":true,"name":"kubelet-aws-config.service"}]}}}}},"osImage":{"variant":"3"},"pauseProperties"
:{"global.connectivity.vpcMode":"private"},"provider":"aws","registry":{"templateName":"awsContainerImageRegistry"},"resourcesApi":{"bastionResourceEnabled":false,"cleanupHelmReleaseResourcesEnabled":true,"clusterResourceEnabled":true,"controlPlaneResourceEnabled":true,"helmRepositoryResourcesEnabled":true,"infrastructureCluster":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSCluster","version":"v1beta2"},"infrastructureMachinePool":{"group":"infrastructure.cluster.x-k8s.io","kind":"AWSMachinePool","version":"v1beta2"},"machineHealthCheckResourceEnabled":true,"machinePoolResourcesEnabled":true,"nodePoolKind":"MachinePool"},"useReleases":true,"workers":{"defaultNodePools":{"def00":{"customNodeLabels":["label=default"],"instanceType":"r6i.xlarge","instanceWarmup":600,"maxSize":3,"minHealthyPercentage":90,"minSize":3}},"kubeadmConfig":{"files":[],"ignition":{"containerLinuxConfig":{"additionalConfig":{"storage":{"filesystems":[{"mount":{"device":"/dev/xvdd","format":"xfs","label":"lib","wipeFilesystem":true},"name":"lib"},{"mount":{"device":"/dev/xvde","format":"xfs","label":"log","wipeFilesystem":true},"name":"log"}]},"systemd":{"units":[{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/lib","where":"/var/lib"},"unit":{"defaultDependencies":false,"description":"lib volume"}},"enabled":true,"name":"var-lib.mount"},{"contents":{"install":{"wantedBy":["local-fs-pre.target"]},"mount":{"type":"xfs","what":"/dev/disk/by-label/log","where":"/var/log"},"unit":{"defaultDependencies":false,"description":"log volume"}},"enabled":true,"name":"var-log.mount"}]}}}},"taints":[{"effect":"NoExecute","key":"ebs.csi.aws.com/agent-not-ready"}]}}}}`| | `cluster-shared` | **Library chart**|**Type:** `[object]`
| | `managementCluster` | **Management cluster** - Name of the Cluster API cluster managing this workload cluster.|**Type:** `[string]`
| | `provider` | **Cluster API provider name**|**Type:** `[string]`
| diff --git a/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-control-plane-values.yaml b/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-control-plane-values.yaml new file mode 100644 index 000000000..db591593d --- /dev/null +++ b/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-control-plane-values.yaml @@ -0,0 +1,22 @@ +global: + release: + version: 29.1.0 + metadata: + name: node-problem-detector-used + organization: test + servicePriority: lowest + connectivity: + baseDomain: example.com + controlPlane: + machineHealthCheck: + enabled: true + diskFullVarLogTimeout: "1337m" + providerSpecific: + region: "eu-west-1" + managementCluster: test + +cluster: + internal: + ephemeralConfiguration: + offlineTesting: + renderWithoutReleaseResource: true diff --git a/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-workers-values.yaml b/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-workers-values.yaml new file mode 100644 index 000000000..749410d70 --- /dev/null +++ b/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-workers-values.yaml @@ -0,0 +1,25 @@ +global: + release: + version: 29.1.0 + metadata: + name: node-problem-detector-used + organization: test + servicePriority: lowest + connectivity: + baseDomain: example.com + nodePools: + pool0: + maxSize: 3 + minSize: 3 + machineHealthCheck: + enabled: true + diskFullContainerdTimeout: "1337m" + providerSpecific: + region: "eu-west-1" + managementCluster: test + +cluster: + internal: + ephemeralConfiguration: + offlineTesting: + renderWithoutReleaseResource: true diff --git a/helm/cluster-aws/values.schema.json b/helm/cluster-aws/values.schema.json index 5df498b13..12392f14b 100644 --- a/helm/cluster-aws/values.schema.json +++ b/helm/cluster-aws/values.schema.json @@ -168,6 +168,86 @@ } } }, + "machineHealthCheck": { + "type": "object", + "title": "Machine health check", + "required": [ 
+ "enabled" + ], + "properties": { + "diskFullContainerdTimeout": { + "type": "string", + "title": "DiskFullContainerd timeout", + "description": "Determines how long a machine health check should wait for a node with condition DiskFullContainerd=True before considering a machine unhealthy. Use an empty value to not consider this condition.", + "examples": [ + "10m", + "100s" + ], + "default": "" + }, + "diskFullKubeletTimeout": { + "type": "string", + "title": "DiskFullKubelet timeout", + "description": "Determines how long a machine health check should wait for a node with condition DiskFullKubelet=True before considering a machine unhealthy. Use an empty value to not consider this condition.", + "examples": [ + "10m", + "100s" + ], + "default": "" + }, + "diskFullVarLogTimeout": { + "type": "string", + "title": "DiskFullVarLog timeout", + "description": "Determines how long a machine health check should wait for a node with condition DiskFullVarLog=True before considering a machine unhealthy. 
Use an empty value to not consider this condition.", + "examples": [ + "10m", + "100s" + ], + "default": "" + }, + "enabled": { + "type": "boolean", + "title": "Enable", + "default": true + }, + "maxUnhealthy": { + "type": "string", + "title": "Maximum unhealthy nodes", + "examples": [ + "40%" + ], + "default": "40%" + }, + "nodeStartupTimeout": { + "type": "string", + "title": "Node startup timeout", + "description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.", + "examples": [ + "10m", + "100s" + ], + "default": "8m0s" + }, + "unhealthyNotReadyTimeout": { + "type": "string", + "title": "Timeout for ready", + "description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.", + "examples": [ + "300s" + ], + "default": "10m0s" + }, + "unhealthyUnknownTimeout": { + "type": "string", + "title": "Timeout for unknown condition", + "description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.", + "examples": [ + "300s" + ], + "default": "10m0s" + } + } + }, "machinePool": { "type": "object", "title": "Node pool", @@ -664,6 +744,10 @@ "description": "Size of the volume mounted at `/var/log` on the worker nodes.", "default": 30 }, + "machineHealthCheck": { + "$ref": "#/$defs/machineHealthCheck", + "title": "Machine health check" + }, "maxHealthyPercentage": { "type": "integer", "title": "Maximum percentage of instances that can be in service when replacing instances.", @@ -834,6 +918,9 @@ "nodeExporter": { "enable": true }, + "nodeProblemDetector": { + "enable": true + }, "observabilityBundle": { "enable": true }, @@ -2160,52 +2247,8 @@ "default": 15 }, "machineHealthCheck": { - "type": "object", - "title": "Machine health check", - "additionalProperties": false, - "properties": { - "enabled": { - "type": "boolean", - "title": "Enable", - "default": true - }, - "maxUnhealthy": { - "type": "string", - 
"title": "Maximum unhealthy nodes", - "examples": [ - "40%" - ], - "default": "40%" - }, - "nodeStartupTimeout": { - "type": "string", - "title": "Node startup timeout", - "description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.", - "examples": [ - "10m", - "100s" - ], - "default": "8m0s" - }, - "unhealthyNotReadyTimeout": { - "type": "string", - "title": "Timeout for ready", - "description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.", - "examples": [ - "300s" - ], - "default": "10m0s" - }, - "unhealthyUnknownTimeout": { - "type": "string", - "title": "Timeout for unknown condition", - "description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.", - "examples": [ - "300s" - ], - "default": "10m0s" - } - } + "$ref": "#/$defs/machineHealthCheck", + "title": "Machine health check" }, "oidc": { "type": "object", diff --git a/helm/cluster-aws/values.yaml b/helm/cluster-aws/values.yaml index 82857e2cf..79e2f01dc 100644 --- a/helm/cluster-aws/values.yaml +++ b/helm/cluster-aws/values.yaml @@ -43,6 +43,8 @@ cluster: enable: true nodeExporter: enable: true + nodeProblemDetector: + enable: true observabilityBundle: enable: true observabilityPolicies: @@ -382,6 +384,9 @@ global: libVolumeSizeGB: 40 logVolumeSizeGB: 15 machineHealthCheck: + diskFullContainerdTimeout: "" + diskFullKubeletTimeout: "" + diskFullVarLogTimeout: "" enabled: true maxUnhealthy: 40% nodeStartupTimeout: 8m0s From 69b769dfa0d1b5fd2c47777709f3846d729f17f7 Mon Sep 17 00:00:00 2001 From: Andreas Sommer Date: Tue, 28 Oct 2025 10:02:52 +0100 Subject: [PATCH 2/5] Switch to final version of cluster chart --- helm/cluster-aws/Chart.lock | 8 ++++---- helm/cluster-aws/Chart.yaml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/helm/cluster-aws/Chart.lock b/helm/cluster-aws/Chart.lock index 13042873d..b3127187f 
100644 --- a/helm/cluster-aws/Chart.lock +++ b/helm/cluster-aws/Chart.lock @@ -1,9 +1,9 @@ dependencies: - name: cluster - repository: https://giantswarm.github.io/cluster-test-catalog - version: 4.3.0-2795756206b2d7c231c07725e30ea51ad63e2a8d + repository: https://giantswarm.github.io/cluster-catalog + version: 4.4.0 - name: cluster-shared repository: https://giantswarm.github.io/cluster-catalog version: 0.7.1 -digest: sha256:9221c097bf3c9aaa096aee4f48437cd290b0c577ed991f58bfb2becdf3183797 -generated: "2025-10-27T09:41:32.757712+01:00" +digest: sha256:5a5cf5f0c737e152a9196a47a8a59b573f81f7bb6b56326db2728ad917672f0d +generated: "2025-10-28T10:02:18.817133+01:00" diff --git a/helm/cluster-aws/Chart.yaml b/helm/cluster-aws/Chart.yaml index 0041ddd3e..239fb64f9 100644 --- a/helm/cluster-aws/Chart.yaml +++ b/helm/cluster-aws/Chart.yaml @@ -16,8 +16,8 @@ restrictions: - capa dependencies: - name: cluster - version: "4.3.0-2795756206b2d7c231c07725e30ea51ad63e2a8d" - repository: https://giantswarm.github.io/cluster-test-catalog + version: "4.4.0" + repository: https://giantswarm.github.io/cluster-catalog - name: cluster-shared version: "0.7.1" repository: https://giantswarm.github.io/cluster-catalog From dbed9ae1eacd1da7f804b27a0fcadb824ce913e0 Mon Sep 17 00:00:00 2001 From: Andreas Sommer Date: Tue, 28 Oct 2025 10:10:21 +0100 Subject: [PATCH 3/5] Add description, remove `maxUnhealthy` default because cluster chart does the defaulting --- helm/cluster-aws/README.md | 2 +- helm/cluster-aws/values.schema.json | 4 ++-- helm/cluster-aws/values.yaml | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/helm/cluster-aws/README.md b/helm/cluster-aws/README.md index ff5ab2f8e..862cb72fb 100644 --- a/helm/cluster-aws/README.md +++ b/helm/cluster-aws/README.md @@ -397,7 +397,7 @@ Properties within the `.global.controlPlane` object | `global.controlPlane.machineHealthCheck.diskFullKubeletTimeout` | **DiskFullKubelet timeout** - Determines how long a machine health check 
should wait for a node with condition DiskFullKubelet=True before considering a machine unhealthy. Use an empty value to not consider this condition.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `""`| | `global.controlPlane.machineHealthCheck.diskFullVarLogTimeout` | **DiskFullVarLog timeout** - Determines how long a machine health check should wait for a node with condition DiskFullVarLog=True before considering a machine unhealthy. Use an empty value to not consider this condition.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `""`| | `global.controlPlane.machineHealthCheck.enabled` | **Enable**|**Type:** `[boolean]`
**Default:** `true`| -| `global.controlPlane.machineHealthCheck.maxUnhealthy` | **Maximum unhealthy nodes**|**Type:** `[string]`
**Example:** `"40%"`
**Default:** `"40%"`| +| `global.controlPlane.machineHealthCheck.maxUnhealthy` | **Maximum unhealthy nodes** - Remediation (= machine deletion) is only performed if at most `maxUnhealthy` machines are unhealthy.|**Type:** `[string]`
**Example:** `"40%"`
| | `global.controlPlane.machineHealthCheck.nodeStartupTimeout` | **Node startup timeout** - Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `"8m0s"`| | `global.controlPlane.machineHealthCheck.unhealthyNotReadyTimeout` | **Timeout for ready** - If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.|**Type:** `[string]`
**Example:** `"300s"`
**Default:** `"10m0s"`| | `global.controlPlane.machineHealthCheck.unhealthyUnknownTimeout` | **Timeout for unknown condition** - If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.|**Type:** `[string]`
**Example:** `"300s"`
**Default:** `"10m0s"`| diff --git a/helm/cluster-aws/values.schema.json b/helm/cluster-aws/values.schema.json index 12392f14b..b65493ba0 100644 --- a/helm/cluster-aws/values.schema.json +++ b/helm/cluster-aws/values.schema.json @@ -213,10 +213,10 @@ "maxUnhealthy": { "type": "string", "title": "Maximum unhealthy nodes", + "description": "Remediation (= machine deletion) is only performed if at most `maxUnhealthy` machines are unhealthy.", "examples": [ "40%" - ], - "default": "40%" + ] }, "nodeStartupTimeout": { "type": "string", diff --git a/helm/cluster-aws/values.yaml b/helm/cluster-aws/values.yaml index 79e2f01dc..4f3af4238 100644 --- a/helm/cluster-aws/values.yaml +++ b/helm/cluster-aws/values.yaml @@ -388,7 +388,6 @@ global: diskFullKubeletTimeout: "" diskFullVarLogTimeout: "" enabled: true - maxUnhealthy: 40% nodeStartupTimeout: 8m0s unhealthyNotReadyTimeout: 10m0s unhealthyUnknownTimeout: 10m0s From 6f340176a031f4cdfce9237df3bb6b6fee456b9e Mon Sep 17 00:00:00 2001 From: Andreas Sommer Date: Tue, 28 Oct 2025 10:11:10 +0100 Subject: [PATCH 4/5] Apply suggestion from @fiunchinho Co-authored-by: Jose Armesto --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d38973a74..bdcdc3740 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Add node-problem-detector-app, disabled by default +- Add node-problem-detector-app, disabled by default. ## [6.3.0] - 2025-10-24 From fe8b7e071633ecd269b95eb71c6625f4c0c81545 Mon Sep 17 00:00:00 2001 From: Marco Ebert Date: Tue, 28 Oct 2025 15:41:37 +0100 Subject: [PATCH 5/5] Add missing node problem detector config. 
--- helm/cluster-aws/README.md | 9 ++++++++- helm/cluster-aws/values.schema.json | 8 +++++++- helm/cluster-aws/values.yaml | 1 + 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/helm/cluster-aws/README.md b/helm/cluster-aws/README.md index 862cb72fb..dec66b20f 100644 --- a/helm/cluster-aws/README.md +++ b/helm/cluster-aws/README.md @@ -223,6 +223,13 @@ Configuration of apps that are part of the cluster. | `global.apps.nodeExporter.extraConfigs[*].name` | **Name** - Name of the config map or secret. The object must exist in the same namespace as the cluster App.|**Type:** `[string]`
| | `global.apps.nodeExporter.extraConfigs[*].priority` | **Priority**|**Type:** `[integer]`
**Default:** `25`| | `global.apps.nodeExporter.values` | **Config map** - Helm Values to be passed to the app as user config.|**Type:** `[object]`
| +| `global.apps.nodeProblemDetector` | **App** - Configuration of a default app that is part of the cluster and is deployed as a HelmRelease resource.|**Type:** `[object]`
| +| `global.apps.nodeProblemDetector.extraConfigs` | **Extra config maps or secrets** - Extra config maps or secrets that will be used to customize the app. The desired values must be under configmap or secret key 'values'. The values are merged in the order given, with the later values overwriting earlier, and then inline values overwriting those. Resources must be in the same namespace as the cluster.|**Type:** `[array]`
| +| `global.apps.nodeProblemDetector.extraConfigs[*]` | **Config map or secret**|**Type:** `[object]`
| +| `global.apps.nodeProblemDetector.extraConfigs[*].kind` | **Kind** - Specifies whether the resource is a config map or a secret.|**Type:** `[string]`
| +| `global.apps.nodeProblemDetector.extraConfigs[*].name` | **Name** - Name of the config map or secret. The object must exist in the same namespace as the cluster App.|**Type:** `[string]`
| +| `global.apps.nodeProblemDetector.extraConfigs[*].optional` | **Optional** - Optional marks this ValuesReference as optional. When set, a not found error for the values reference is ignored, but any ValuesKey, TargetPath or transient error will still result in a reconciliation failure.|**Type:** `[boolean]`
| +| `global.apps.nodeProblemDetector.values` | **Values** - Values to be passed to the app. Values will have higher priority than values from configmaps.|**Type:** `[object]`
| | `global.apps.observabilityBundle` | **App resource** - Configuration of a default app that is part of the cluster and is deployed as an App resource.|**Type:** `[object]`
| | `global.apps.observabilityBundle.extraConfigs` | **Extra config maps or secrets** - Extra config maps or secrets that will be used to customize to the app. The desired values must be under configmap or secret key 'values'. The values are merged in the order given, with the later values overwriting earlier, and then inline values overwriting those. Resources must be in the same namespace as the cluster.|**Type:** `[array]`
| | `global.apps.observabilityBundle.extraConfigs[*]` | **Config map or secret**|**Type:** `[object]`
| @@ -397,7 +404,7 @@ Properties within the `.global.controlPlane` object | `global.controlPlane.machineHealthCheck.diskFullKubeletTimeout` | **DiskFullKubelet timeout** - Determines how long a machine health check should wait for a node with condition DiskFullKubelet=True before considering a machine unhealthy. Use an empty value to not consider this condition.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `""`| | `global.controlPlane.machineHealthCheck.diskFullVarLogTimeout` | **DiskFullVarLog timeout** - Determines how long a machine health check should wait for a node with condition DiskFullVarLog=True before considering a machine unhealthy. Use an empty value to not consider this condition.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `""`| | `global.controlPlane.machineHealthCheck.enabled` | **Enable**|**Type:** `[boolean]`
**Default:** `true`| -| `global.controlPlane.machineHealthCheck.maxUnhealthy` | **Maximum unhealthy nodes** - Remediation (= machine deletion) is only performed if at most `maxUnhealthy` machines are unhealthy.|**Type:** `[string]`
**Example:** `"40%"`
| +| `global.controlPlane.machineHealthCheck.maxUnhealthy` | **Maximum unhealthy nodes** - Defaults to 40% for control plane nodes and 20% for worker nodes.|**Type:** `[string]`
**Example:** `"40%"`
| | `global.controlPlane.machineHealthCheck.nodeStartupTimeout` | **Node startup timeout** - Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.|**Type:** `[string]`
**Examples:** `"10m", "100s"`
**Default:** `"8m0s"`| | `global.controlPlane.machineHealthCheck.unhealthyNotReadyTimeout` | **Timeout for ready** - If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.|**Type:** `[string]`
**Example:** `"300s"`
**Default:** `"10m0s"`| | `global.controlPlane.machineHealthCheck.unhealthyUnknownTimeout` | **Timeout for unknown condition** - If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.|**Type:** `[string]`
**Example:** `"300s"`
**Default:** `"10m0s"`| diff --git a/helm/cluster-aws/values.schema.json b/helm/cluster-aws/values.schema.json index ca3ea0748..2ad1b58bb 100644 --- a/helm/cluster-aws/values.schema.json +++ b/helm/cluster-aws/values.schema.json @@ -213,7 +213,7 @@ "maxUnhealthy": { "type": "string", "title": "Maximum unhealthy nodes", - "description": "Remediation (= machine deletion) is only performed if at most `maxUnhealthy` machines are unhealthy.", + "description": "Defaults to 40% for control plane nodes and 20% for worker nodes.", "examples": [ "40%" ] @@ -1452,6 +1452,12 @@ "title": "node-exporter", "description": "Configuration of node-exporter. For all available values see https://github.com/giantswarm/node-exporter-app." }, + "nodeProblemDetector": { + "$ref": "#/$defs/helmRelease", + "type": "object", + "title": "node-problem-detector", + "description": "Configuration of node-problem-detector-app. For all available values see https://github.com/giantswarm/node-problem-detector-app." + }, "observabilityBundle": { "$ref": "#/$defs/app", "type": "object", diff --git a/helm/cluster-aws/values.yaml b/helm/cluster-aws/values.yaml index 4f3af4238..ecb93c1db 100644 --- a/helm/cluster-aws/values.yaml +++ b/helm/cluster-aws/values.yaml @@ -298,6 +298,7 @@ global: netExporter: {} networkPolicies: {} nodeExporter: {} + nodeProblemDetector: {} observabilityBundle: {} observabilityPolicies: {} prometheusBlackboxExporter: {}