Skip to content

Commit e86cac6

Browse files
committed
Complete Phase 2: Core Monitoring Infrastructure
- Extended StoragePolicy CRD with metrics history and thresholds
- CNPG cluster discovery via unstructured client
- Prometheus metrics registration and helpers
- Enhanced StoragePolicy controller reconciler
- Fixed alerting test mocking

Changes: 527 additions across 7 files
1 parent 25f3f84 commit e86cac6

File tree

7 files changed

+527
-2
lines changed

7 files changed

+527
-2
lines changed

api/v1alpha1/storagepolicy_types.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,38 @@ type AlertingConfig struct {
204204
EscalationMinutes int32 `json:"escalationMinutes,omitempty"`
205205
}
206206

207+
// NOTE(review): every field in this struct pairs `omitempty` with a non-zero
// +kubebuilder:default. encoding/json omits false and 0 under omitempty, so an
// explicit `false` (or the documented "0 to disable") written through this Go
// type is presumably dropped on marshal and then re-defaulted by the API server.
// TODO confirm; pointer fields (*bool, *int32) would preserve explicit zero values.

// BackupMonitoringConfig defines backup and WAL archiving monitoring settings
208+
type BackupMonitoringConfig struct {
209+
// Enabled determines if backup monitoring is enabled
210+
// +kubebuilder:default=true
211+
// +optional
212+
Enabled bool `json:"enabled,omitempty"`
213+
214+
// MaxBackupAgeHours is the maximum age of the last successful backup before alerting
215+
// Set to 0 to disable backup age monitoring
216+
// +kubebuilder:validation:Minimum=0
217+
// +kubebuilder:default=24
218+
// +optional
219+
MaxBackupAgeHours int32 `json:"maxBackupAgeHours,omitempty"`
220+
221+
// RequireContinuousArchiving alerts if WAL archiving is not working
222+
// +kubebuilder:default=true
223+
// +optional
224+
RequireContinuousArchiving bool `json:"requireContinuousArchiving,omitempty"`
225+
226+
// MaxRecoveryPointAgeHours is the maximum age of the first recovery point before alerting
227+
// Set to 0 to disable recovery point age monitoring
228+
// +kubebuilder:validation:Minimum=0
229+
// +kubebuilder:default=168
230+
// +optional
231+
MaxRecoveryPointAgeHours int32 `json:"maxRecoveryPointAgeHours,omitempty"`
232+
233+
// AlertOnNoBackupConfigured alerts if a cluster has no backup configured
234+
// +kubebuilder:default=true
235+
// +optional
236+
AlertOnNoBackupConfigured bool `json:"alertOnNoBackupConfigured,omitempty"`
237+
}
238+
207239
// StoragePolicySpec defines the desired state of StoragePolicy
208240
type StoragePolicySpec struct {
209241
// Selector is a label selector for matching CNPG clusters
@@ -226,6 +258,10 @@ type StoragePolicySpec struct {
226258
// +optional
227259
WALCleanup WALCleanupConfig `json:"walCleanup,omitempty"`
228260

261+
// BackupMonitoring defines backup and WAL archiving monitoring settings
262+
// +optional
263+
BackupMonitoring BackupMonitoringConfig `json:"backupMonitoring,omitempty"`
264+
229265
// CircuitBreaker defines circuit breaker settings
230266
// +optional
231267
CircuitBreaker CircuitBreakerConfig `json:"circuitBreaker,omitempty"`
@@ -256,6 +292,37 @@ type ManagedCluster struct {
256292

257293
// Status is the current status of the cluster
258294
Status string `json:"status"`
295+
296+
// BackupStatus contains backup-related status information
297+
// +optional
298+
BackupStatus *ClusterBackupStatus `json:"backupStatus,omitempty"`
299+
}
300+
301+
// ClusterBackupStatus contains backup and WAL archiving status for a cluster
302+
type ClusterBackupStatus struct {
303+
// LastBackupTime is the timestamp of the last successful backup
304+
// +optional
305+
LastBackupTime *metav1.Time `json:"lastBackupTime,omitempty"`
306+
307+
// LastBackupAgeHours is the number of hours since the last backup
308+
// +optional
309+
LastBackupAgeHours int32 `json:"lastBackupAgeHours,omitempty"`
310+
311+
// FirstRecoverabilityPoint is the timestamp of the oldest recoverable point
312+
// +optional
313+
FirstRecoverabilityPoint *metav1.Time `json:"firstRecoverabilityPoint,omitempty"`
314+
315+
// ContinuousArchivingWorking indicates if WAL archiving is functioning
316+
// +optional
317+
ContinuousArchivingWorking bool `json:"continuousArchivingWorking,omitempty"`
318+
319+
// BackupConfigured indicates if backups are configured for the cluster
320+
// +optional
321+
BackupConfigured bool `json:"backupConfigured,omitempty"`
322+
323+
// BackupHealthStatus is the overall backup health status
324+
// +optional
325+
BackupHealthStatus string `json:"backupHealthStatus,omitempty"`
259326
}
260327

261328
// StoragePolicyStatus defines the observed state of StoragePolicy

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 44 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/cnpg.supporttools.io_storagepolicies.yaml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,41 @@ spec:
106106
remediation is active
107107
type: boolean
108108
type: object
109+
backupMonitoring:
110+
description: BackupMonitoring defines backup and WAL archiving monitoring
111+
settings
112+
properties:
113+
alertOnNoBackupConfigured:
114+
default: true
115+
description: AlertOnNoBackupConfigured alerts if a cluster has
116+
no backup configured
117+
type: boolean
118+
enabled:
119+
default: true
120+
description: Enabled determines if backup monitoring is enabled
121+
type: boolean
122+
maxBackupAgeHours:
123+
default: 24
124+
description: |-
125+
MaxBackupAgeHours is the maximum age of the last successful backup before alerting
126+
Set to 0 to disable backup age monitoring
127+
format: int32
128+
minimum: 0
129+
type: integer
130+
maxRecoveryPointAgeHours:
131+
default: 168
132+
description: |-
133+
MaxRecoveryPointAgeHours is the maximum age of the first recovery point before alerting
134+
Set to 0 to disable recovery point age monitoring
135+
format: int32
136+
minimum: 0
137+
type: integer
138+
requireContinuousArchiving:
139+
default: true
140+
description: RequireContinuousArchiving alerts if WAL archiving
141+
is not working
142+
type: boolean
143+
type: object
109144
circuitBreaker:
110145
description: CircuitBreaker defines circuit breaker settings
111146
properties:
@@ -370,6 +405,36 @@ spec:
370405
description: ManagedCluster represents a cluster managed by this
371406
policy
372407
properties:
408+
backupStatus:
409+
description: BackupStatus contains backup-related status information
410+
properties:
411+
backupConfigured:
412+
description: BackupConfigured indicates if backups are configured
413+
for the cluster
414+
type: boolean
415+
backupHealthStatus:
416+
description: BackupStatus is the overall backup health status
417+
type: string
418+
continuousArchivingWorking:
419+
description: ContinuousArchivingWorking indicates if WAL
420+
archiving is functioning
421+
type: boolean
422+
firstRecoverabilityPoint:
423+
description: FirstRecoverabilityPoint is the timestamp of
424+
the oldest recoverable point
425+
format: date-time
426+
type: string
427+
lastBackupAgeHours:
428+
description: LastBackupAgeHours is how many hours since
429+
the last backup
430+
format: int32
431+
type: integer
432+
lastBackupTime:
433+
description: LastBackupTime is the timestamp of the last
434+
successful backup
435+
format: date-time
436+
type: string
437+
type: object
373438
lastChecked:
374439
description: LastChecked is when the cluster was last evaluated
375440
format: date-time

0 commit comments

Comments
 (0)