Merge branch 'main' into iam-workers-crossplane

fiunchinho · web-flow · commit 88ee5ea9e27e · 2025-10-28T17:57:11.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Add node-problem-detector-app, disabled by default.
 - Add Crossplane IAM Roles, policies and instance profiles for the worker nodes. Instead of having an IAM Role per node pool, now we'll use the same for all node pools. *This change will roll the worker nodes*.
 
 ### Changed
diff --git a/helm/cluster-aws/Chart.lock b/helm/cluster-aws/Chart.lock
@@ -1,9 +1,9 @@
 dependencies:
 - name: cluster
   repository: https://giantswarm.github.io/cluster-catalog
-  version: 4.3.0
+  version: 4.4.0
 - name: cluster-shared
   repository: https://giantswarm.github.io/cluster-catalog
   version: 0.7.1
-digest: sha256:98128b1252d1b43d0d6778546eded7d0be50da943a7d4e108cf95f581c0d5067
-generated: "2025-10-24T22:03:46.995588877Z"
+digest: sha256:5a5cf5f0c737e152a9196a47a8a59b573f81f7bb6b56326db2728ad917672f0d
+generated: "2025-10-28T10:02:18.817133+01:00"
diff --git a/helm/cluster-aws/Chart.yaml b/helm/cluster-aws/Chart.yaml
@@ -16,7 +16,7 @@ restrictions:
     - capa
 dependencies:
   - name: cluster
-    version: "4.3.0"
+    version: "4.4.0"
     repository: https://giantswarm.github.io/cluster-catalog
   - name: cluster-shared
     version: "0.7.1"
diff --git a/helm/cluster-aws/README.md b/helm/cluster-aws/README.md
diff --git a/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-control-plane-values.yaml b/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-control-plane-values.yaml
@@ -0,0 +1,22 @@
+global:
+  release:
+    version: 29.1.0
+  metadata:
+    name: node-problem-detector-used
+    organization: test
+    servicePriority: lowest
+  connectivity:
+    baseDomain: example.com
+  controlPlane:
+    machineHealthCheck:
+      enabled: true
+      diskFullVarLogTimeout: "1337m"
+  providerSpecific:
+    region: "eu-west-1"
+  managementCluster: test
+
+cluster:
+  internal:
+    ephemeralConfiguration:
+      offlineTesting:
+        renderWithoutReleaseResource: true
diff --git a/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-workers-values.yaml b/helm/cluster-aws/ci/test-machinehealthcheck-node-problem-detector-used-workers-values.yaml
@@ -0,0 +1,25 @@
+global:
+  release:
+    version: 29.1.0
+  metadata:
+    name: node-problem-detector-used
+    organization: test
+    servicePriority: lowest
+  connectivity:
+    baseDomain: example.com
+  nodePools:
+    pool0:
+      maxSize: 3
+      minSize: 3
+      machineHealthCheck:
+        enabled: true
+        diskFullContainerdTimeout: "1337m"
+  providerSpecific:
+    region: "eu-west-1"
+  managementCluster: test
+
+cluster:
+  internal:
+    ephemeralConfiguration:
+      offlineTesting:
+        renderWithoutReleaseResource: true
diff --git a/helm/cluster-aws/values.schema.json b/helm/cluster-aws/values.schema.json
@@ -168,6 +168,86 @@
                 }
             }
         },
+        "machineHealthCheck": {
+            "type": "object",
+            "title": "Machine health check",
+            "required": [
+                "enabled"
+            ],
+            "properties": {
+                "diskFullContainerdTimeout": {
+                    "type": "string",
+                    "title": "DiskFullContainerd timeout",
+                    "description": "Determines how long a machine health check should wait for a node with condition DiskFullContainerd=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
+                    "examples": [
+                        "10m",
+                        "100s"
+                    ],
+                    "default": ""
+                },
+                "diskFullKubeletTimeout": {
+                    "type": "string",
+                    "title": "DiskFullKubelet timeout",
+                    "description": "Determines how long a machine health check should wait for a node with condition DiskFullKubelet=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
+                    "examples": [
+                        "10m",
+                        "100s"
+                    ],
+                    "default": ""
+                },
+                "diskFullVarLogTimeout": {
+                    "type": "string",
+                    "title": "DiskFullVarLog timeout",
+                    "description": "Determines how long a machine health check should wait for a node with condition DiskFullVarLog=True before considering a machine unhealthy. Use an empty value to not consider this condition.",
+                    "examples": [
+                        "10m",
+                        "100s"
+                    ],
+                    "default": ""
+                },
+                "enabled": {
+                    "type": "boolean",
+                    "title": "Enable",
+                    "default": true
+                },
+                "maxUnhealthy": {
+                    "type": "string",
+                    "title": "Maximum unhealthy nodes",
+                    "description": "Defaults to 40% for control plane nodes and 20% for worker nodes.",
+                    "examples": [
+                        "40%"
+                    ]
+                },
+                "nodeStartupTimeout": {
+                    "type": "string",
+                    "title": "Node startup timeout",
+                    "description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.",
+                    "examples": [
+                        "10m",
+                        "100s"
+                    ],
+                    "default": "8m0s"
+                },
+                "unhealthyNotReadyTimeout": {
+                    "type": "string",
+                    "title": "Timeout for ready",
+                    "description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.",
+                    "examples": [
+                        "300s"
+                    ],
+                    "default": "10m0s"
+                },
+                "unhealthyUnknownTimeout": {
+                    "type": "string",
+                    "title": "Timeout for unknown condition",
+                    "description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.",
+                    "examples": [
+                        "300s"
+                    ],
+                    "default": "10m0s"
+                }
+            }
+        },
         "machinePool": {
             "type": "object",
             "title": "Node pool",
@@ -664,6 +744,10 @@
                                     "description": "Size of the volume mounted at `/var/log` on the worker nodes.",
                                     "default": 30
                                 },
+                                "machineHealthCheck": {
+                                    "$ref": "#/$defs/machineHealthCheck",
+                                    "title": "Machine health check"
+                                },
                                 "maxHealthyPercentage": {
                                     "type": "integer",
                                     "title": "Maximum percentage of instances that can be in service when replacing instances.",
@@ -834,6 +918,9 @@
                         "nodeExporter": {
                             "enable": true
                         },
+                        "nodeProblemDetector": {
+                            "enable": true
+                        },
                         "observabilityBundle": {
                             "enable": true
                         },
@@ -1365,6 +1452,12 @@
                             "title": "node-exporter",
                             "description": "Configuration of node-exporter. For all available values see https://github.com/giantswarm/node-exporter-app."
                         },
+                        "nodeProblemDetector": {
+                            "$ref": "#/$defs/helmRelease",
+                            "type": "object",
+                            "title": "node-problem-detector",
+                            "description": "Configuration of node-problem-detector-app. For all available values see https://github.com/giantswarm/node-problem-detector-app."
+                        },
                         "observabilityBundle": {
                             "$ref": "#/$defs/app",
                             "type": "object",
@@ -2164,52 +2257,8 @@
                             "default": 15
                         },
                         "machineHealthCheck": {
-                            "type": "object",
-                            "title": "Machine health check",
-                            "additionalProperties": false,
-                            "properties": {
-                                "enabled": {
-                                    "type": "boolean",
-                                    "title": "Enable",
-                                    "default": true
-                                },
-                                "maxUnhealthy": {
-                                    "type": "string",
-                                    "title": "Maximum unhealthy nodes",
-                                    "examples": [
-                                        "40%"
-                                    ],
-                                    "default": "40%"
-                                },
-                                "nodeStartupTimeout": {
-                                    "type": "string",
-                                    "title": "Node startup timeout",
-                                    "description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.",
-                                    "examples": [
-                                        "10m",
-                                        "100s"
-                                    ],
-                                    "default": "8m0s"
-                                },
-                                "unhealthyNotReadyTimeout": {
-                                    "type": "string",
-                                    "title": "Timeout for ready",
-                                    "description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.",
-                                    "examples": [
-                                        "300s"
-                                    ],
-                                    "default": "10m0s"
-                                },
-                                "unhealthyUnknownTimeout": {
-                                    "type": "string",
-                                    "title": "Timeout for unknown condition",
-                                    "description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.",
-                                    "examples": [
-                                        "300s"
-                                    ],
-                                    "default": "10m0s"
-                                }
-                            }
+                            "$ref": "#/$defs/machineHealthCheck",
+                            "title": "Machine health check"
                         },
                         "oidc": {
                             "type": "object",
diff --git a/helm/cluster-aws/values.yaml b/helm/cluster-aws/values.yaml
@@ -43,6 +43,8 @@ cluster:
         enable: true
       nodeExporter:
         enable: true
+      nodeProblemDetector:
+        enable: true
       observabilityBundle:
         enable: true
       observabilityPolicies:
@@ -296,6 +298,7 @@ global:
     netExporter: {}
     networkPolicies: {}
     nodeExporter: {}
+    nodeProblemDetector: {}
     observabilityBundle: {}
     observabilityPolicies: {}
     prometheusBlackboxExporter: {}
@@ -382,8 +385,10 @@ global:
     libVolumeSizeGB: 40
     logVolumeSizeGB: 15
     machineHealthCheck:
+      diskFullContainerdTimeout: ""
+      diskFullKubeletTimeout: ""
+      diskFullVarLogTimeout: ""
       enabled: true
-      maxUnhealthy: 40%
       nodeStartupTimeout: 8m0s
       unhealthyNotReadyTimeout: 10m0s
       unhealthyUnknownTimeout: 10m0s