|
168 | 168 | } |
169 | 169 | } |
170 | 170 | }, |
| 171 | + "machineHealthCheck": { |
| 172 | + "type": "object", |
| 173 | + "title": "Machine health check", |
| 174 | + "required": [ |
| 175 | + "enabled" |
| 176 | + ], |
| 177 | + "properties": { |
| 178 | + "diskFullContainerdTimeout": { |
| 179 | + "type": "string", |
| 180 | + "title": "DiskFullContainerd timeout", |
| 181 | + "description": "Determines how long a machine health check should wait for a node with condition DiskFullContainerd=True before considering a machine unhealthy. Use an empty value to not consider this condition.", |
| 182 | + "examples": [ |
| 183 | + "10m", |
| 184 | + "100s" |
| 185 | + ], |
| 186 | + "default": "" |
| 187 | + }, |
| 188 | + "diskFullKubeletTimeout": { |
| 189 | + "type": "string", |
| 190 | + "title": "DiskFullKubelet timeout", |
| 191 | + "description": "Determines how long a machine health check should wait for a node with condition DiskFullKubelet=True before considering a machine unhealthy. Use an empty value to not consider this condition.", |
| 192 | + "examples": [ |
| 193 | + "10m", |
| 194 | + "100s" |
| 195 | + ], |
| 196 | + "default": "" |
| 197 | + }, |
| 198 | + "diskFullVarLogTimeout": { |
| 199 | + "type": "string", |
| 200 | + "title": "DiskFullVarLog timeout", |
| 201 | + "description": "Determines how long a machine health check should wait for a node with condition DiskFullVarLog=True before considering a machine unhealthy. Use an empty value to not consider this condition.", |
| 202 | + "examples": [ |
| 203 | + "10m", |
| 204 | + "100s" |
| 205 | + ], |
| 206 | + "default": "" |
| 207 | + }, |
| 208 | + "enabled": { |
| 209 | + "type": "boolean", |
| 210 | + "title": "Enable", |
| 211 | + "default": true |
| 212 | + }, |
| 213 | + "maxUnhealthy": { |
| 214 | + "type": "string", |
| 215 | + "title": "Maximum unhealthy nodes", |
| 216 | + "description": "Maximum number or percentage of unhealthy machines tolerated before further remediation is halted. Defaults to 40% for control plane nodes and 20% for worker nodes.", |
| 217 | + "examples": [ |
| 218 | + "40%" |
| 219 | + ] |
| 220 | + }, |
| 221 | + "nodeStartupTimeout": { |
| 222 | + "type": "string", |
| 223 | + "title": "Node startup timeout", |
| 224 | + "description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.", |
| 225 | + "examples": [ |
| 226 | + "10m", |
| 227 | + "100s" |
| 228 | + ], |
| 229 | + "default": "8m0s" |
| 230 | + }, |
| 231 | + "unhealthyNotReadyTimeout": { |
| 232 | + "type": "string", |
| 233 | + "title": "Timeout for not ready condition", |
| 234 | + "description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.", |
| 235 | + "examples": [ |
| 236 | + "300s" |
| 237 | + ], |
| 238 | + "default": "10m0s" |
| 239 | + }, |
| 240 | + "unhealthyUnknownTimeout": { |
| 241 | + "type": "string", |
| 242 | + "title": "Timeout for unknown condition", |
| 243 | + "description": "If a node's 'Ready' condition has status 'Unknown' for longer than this timeout, it will be considered unhealthy.", |
| 244 | + "examples": [ |
| 245 | + "300s" |
| 246 | + ], |
| 247 | + "default": "10m0s" |
| 248 | + } |
| 249 | + } |
| 250 | + }, |
171 | 251 | "machinePool": { |
172 | 252 | "type": "object", |
173 | 253 | "title": "Node pool", |
|
664 | 744 | "description": "Size of the volume mounted at `/var/log` on the worker nodes.", |
665 | 745 | "default": 30 |
666 | 746 | }, |
| 747 | + "machineHealthCheck": { |
| 748 | + "$ref": "#/$defs/machineHealthCheck", |
| 749 | + "title": "Machine health check" |
| 750 | + }, |
667 | 751 | "maxHealthyPercentage": { |
668 | 752 | "type": "integer", |
669 | 753 | "title": "Maximum percentage of instances that can be in service when replacing instances.", |
|
834 | 918 | "nodeExporter": { |
835 | 919 | "enable": true |
836 | 920 | }, |
| 921 | + "nodeProblemDetector": { |
| 922 | + "enable": true |
| 923 | + }, |
837 | 924 | "observabilityBundle": { |
838 | 925 | "enable": true |
839 | 926 | }, |
|
1365 | 1452 | "title": "node-exporter", |
1366 | 1453 | "description": "Configuration of node-exporter. For all available values see https://github.com/giantswarm/node-exporter-app." |
1367 | 1454 | }, |
| 1455 | + "nodeProblemDetector": { |
| 1456 | + "$ref": "#/$defs/helmRelease", |
| 1457 | + "type": "object", |
| 1458 | + "title": "node-problem-detector", |
| 1459 | + "description": "Configuration of node-problem-detector-app. For all available values see https://github.com/giantswarm/node-problem-detector-app." |
| 1460 | + }, |
1368 | 1461 | "observabilityBundle": { |
1369 | 1462 | "$ref": "#/$defs/app", |
1370 | 1463 | "type": "object", |
|
2164 | 2257 | "default": 15 |
2165 | 2258 | }, |
2166 | 2259 | "machineHealthCheck": { |
2167 | | - "type": "object", |
2168 | | - "title": "Machine health check", |
2169 | | - "additionalProperties": false, |
2170 | | - "properties": { |
2171 | | - "enabled": { |
2172 | | - "type": "boolean", |
2173 | | - "title": "Enable", |
2174 | | - "default": true |
2175 | | - }, |
2176 | | - "maxUnhealthy": { |
2177 | | - "type": "string", |
2178 | | - "title": "Maximum unhealthy nodes", |
2179 | | - "examples": [ |
2180 | | - "40%" |
2181 | | - ], |
2182 | | - "default": "40%" |
2183 | | - }, |
2184 | | - "nodeStartupTimeout": { |
2185 | | - "type": "string", |
2186 | | - "title": "Node startup timeout", |
2187 | | - "description": "Determines how long a machine health check should wait for a node to join the cluster, before considering a machine unhealthy.", |
2188 | | - "examples": [ |
2189 | | - "10m", |
2190 | | - "100s" |
2191 | | - ], |
2192 | | - "default": "8m0s" |
2193 | | - }, |
2194 | | - "unhealthyNotReadyTimeout": { |
2195 | | - "type": "string", |
2196 | | - "title": "Timeout for ready", |
2197 | | - "description": "If a node is not in condition 'Ready' after this timeout, it will be considered unhealthy.", |
2198 | | - "examples": [ |
2199 | | - "300s" |
2200 | | - ], |
2201 | | - "default": "10m0s" |
2202 | | - }, |
2203 | | - "unhealthyUnknownTimeout": { |
2204 | | - "type": "string", |
2205 | | - "title": "Timeout for unknown condition", |
2206 | | - "description": "If a node is in 'Unknown' condition after this timeout, it will be considered unhealthy.", |
2207 | | - "examples": [ |
2208 | | - "300s" |
2209 | | - ], |
2210 | | - "default": "10m0s" |
2211 | | - } |
2212 | | - } |
| 2260 | + "$ref": "#/$defs/machineHealthCheck", |
| 2261 | + "title": "Machine health check" |
2213 | 2262 | }, |
2214 | 2263 | "oidc": { |
2215 | 2264 | "type": "object", |
|
0 commit comments