Skip to content

Commit abb536b

Browse files
Merge pull request #499 from rolandmkunkel/SREP-215-finetune-the-hibernation-notification-in-CAD
SREP-215: always add comment to CHGM comment if the cluster was resumed…
2 parents fc8ba3f + f09aa96 commit abb536b

File tree

3 files changed

+27
-64
lines changed

3 files changed

+27
-64
lines changed

pkg/investigations/chgm/chgm.go

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,15 @@ func (c *Investiation) Run(r *investigation.Resources) (investigation.Investigat
6161
notes.AppendSuccess("Customer did not stop nodes.")
6262
logging.Info("The customer has not stopped/terminated any nodes.")
6363

64-
// 2. Check if the cluster is fresh out of a long hibernation
65-
// TODO(Claudio): OSD-18775 - add the note regardless of how long the cluster was hibernated, as long as it came just out of hibernation.
66-
longHibernation, err := investigateHibernation(r.Cluster, r.OcmClient)
64+
// 2. Check if the cluster was hibernated and has recently resumed.
65+
hibernationPeriods, err := getHibernationStatusForCluster(r.OcmClient, r.Cluster)
6766
if err != nil {
6867
logging.Warnf("could not check hibernation status of cluster: %w", err)
6968
}
70-
if longHibernation {
71-
logging.Info("The cluster was hibernated for too long.")
72-
notes.AppendWarning("Cluster was hibernated more than %.0f days - investigate CSRs and kubelet certificates: see https://github.com/openshift/ops-sop/blob/master/v4/alerts/cluster_has_gone_missing.md#24-hibernation", hibernationTooLong.Hours()/24)
69+
70+
if hasRecentlyResumed(hibernationPeriods, time.Now()) {
71+
logging.Info("The cluster has recently resumed from hibernation.")
72+
notes.AppendWarning("Cluster has resumed from hibernation within the last %.0f hours - investigate CSRs and kubelet certificates: see https://github.com/openshift/ops-sop/blob/master/v4/alerts/cluster_has_gone_missing.md#24-hibernation", recentWakeupTime.Hours())
7373
} else {
7474
logging.Info("The cluster was not hibernated for too long.")
7575
}
@@ -132,20 +132,17 @@ func (c *Investiation) IsExperimental() bool {
132132
return false
133133
}
134134

135-
// investigateHibernation checks if the cluster was recently woken up from
136-
// hibernation. If clusters are hibernated for more than 30 days, the internal
137-
// certificates of the kubelets can expire and CSRs need to be approved
135+
// hasRecentlyResumed reports whether the last hibernation period in the list
136+
// ended within recentWakeupTime (2h) before now. In that case, the internal
137+
// certificates of the kubelets could have expired and CSRs need to be approved
138138
// manually:
139139
// - https://github.com/openshift/hive/blob/master/docs/hibernating-clusters.md
140-
func investigateHibernation(cluster *cmv1.Cluster, client ocm.Client) (bool, error) {
141-
hibernations, err := getHibernationStatusForCluster(client, cluster)
142-
if err != nil {
143-
return false, err
144-
}
145-
if len(hibernations) == 0 {
146-
return false, nil
140+
func hasRecentlyResumed(hibernationPeriods []*hibernationPeriod, now time.Time) bool {
141+
if len(hibernationPeriods) == 0 {
142+
return false
147143
}
148-
return hibernatedTooLong(hibernations, time.Now()), nil
144+
latestHibernation := hibernationPeriods[len(hibernationPeriods)-1]
145+
return now.Sub(latestHibernation.DehibernationTime) <= recentWakeupTime
149146
}
150147

151148
// isUserAllowedToStop verifies if a user is allowed to stop/terminate instances

pkg/investigations/chgm/chgm_hibernation_check.go

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,6 @@ import (
1111

1212
const recentWakeupTime = 2 * time.Hour
1313

14-
// 30 Days is always a problem as kubelet certificates will be expired
15-
const hibernationTooLong = 30 * 24 * time.Hour
16-
1714
const (
1815
hibernationStartEvent = "cluster_state_hibernating"
1916
hibernationEndEvent = "cluster_state_ready"
@@ -27,24 +24,6 @@ type hibernationPeriod struct {
2724
DehibernationTime time.Time
2825
}
2926

30-
func hibernatedTooLong(hibernations []*hibernationPeriod, now time.Time) bool {
31-
if len(hibernations) == 0 {
32-
return false
33-
}
34-
latestHibernation := hibernations[len(hibernations)-1]
35-
// The cluster was woken up within the RECENT_WAKEUP_TIME which might
36-
// indicate a CSR problem.
37-
if now.Sub(latestHibernation.DehibernationTime) >= recentWakeupTime {
38-
return false
39-
}
40-
// Only clusters that have hibernated for a long time are susceptible to
41-
// have cert issues.
42-
if latestHibernation.HibernationDuration >= hibernationTooLong {
43-
return true
44-
}
45-
return false
46-
}
47-
4827
func getHibernationStatusForCluster(ocmClient ocm.Client, cluster *cmv1.Cluster) ([]*hibernationPeriod, error) {
4928
filter := "log_type='cluster-state-updates'"
5029
clusterStateUpdates, err := ocmClient.GetServiceLog(cluster, filter)

pkg/investigations/chgm/chgm_hibernation_check_test.go

Lines changed: 13 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -71,15 +71,10 @@ func TestHibernatedTooLong(t *testing.T) {
7171
now time.Time
7272
}
7373
hibernationStartTime := time.Date(2023, 0o1, 0o1, 0o0, 0o0, 0o0, 0o0, time.Local)
74-
hibernationShortStopTime := time.Date(2023, 0o1, 11, 0o0, 0o0, 0o0, 0o0, time.Local)
75-
hibernationLongStopTime := time.Date(2023, 0o2, 11, 0o0, 0o0, 0o0, 0o0, time.Local)
76-
shortHibernation := &hibernationPeriod{
77-
HibernationDuration: hibernationShortStopTime.Sub(hibernationStartTime),
78-
DehibernationTime: hibernationShortStopTime,
79-
}
80-
longHibernation := &hibernationPeriod{
81-
HibernationDuration: hibernationLongStopTime.Sub(hibernationStartTime),
82-
DehibernationTime: hibernationLongStopTime,
74+
hibernationStopTime := time.Date(2023, 0o2, 11, 0o0, 0o0, 0o0, 0o0, time.Local)
75+
hibernation := &hibernationPeriod{
76+
HibernationDuration: hibernationStopTime.Sub(hibernationStartTime),
77+
DehibernationTime: hibernationStopTime,
8378
}
8479
tests := []struct {
8580
name string
@@ -88,40 +83,32 @@ func TestHibernatedTooLong(t *testing.T) {
8883
}{
8984
// TODO: Add test cases.
9085
{
91-
name: "Cluster that hibernated for 10 days is ok",
86+
name: "Cluster with a hibernation that was longer ago does not count as recently resumed",
9287
args: args{
93-
hibernations: []*hibernationPeriod{shortHibernation},
94-
now: hibernationShortStopTime.Add(1 * time.Hour),
88+
hibernations: []*hibernationPeriod{hibernation},
89+
now: hibernationStopTime.Add(24 * time.Hour),
9590
},
9691
want: false,
9792
},
9893
{
99-
name: "Cluster that hibernated for 30+ days is too long",
94+
name: "Cluster that woke up 30 min ago counts as recently resumed",
10095
args: args{
101-
hibernations: []*hibernationPeriod{longHibernation},
102-
now: hibernationLongStopTime.Add(1 * time.Hour),
96+
hibernations: []*hibernationPeriod{hibernation},
97+
now: hibernationStopTime.Add(30 * time.Minute),
10398
},
10499
want: true,
105100
},
106101
{
107-
name: "Cluster that never hibernated is ok",
102+
name: "Cluster that never hibernated does not count as recently resumed",
108103
args: args{},
109104
want: false,
110105
},
111-
{
112-
name: "Cluster that woke up for 2+ hours ago ok",
113-
args: args{
114-
hibernations: []*hibernationPeriod{longHibernation},
115-
now: hibernationLongStopTime.Add(3 * time.Hour),
116-
},
117-
want: false,
118-
},
119106
}
120107
for _, tt := range tests {
121108
t.Run(tt.name, func(t *testing.T) {
122-
got := hibernatedTooLong(tt.args.hibernations, tt.args.now)
109+
got := hasRecentlyResumed(tt.args.hibernations, tt.args.now)
123110
if got != tt.want {
124-
t.Errorf("HibernatedTooLong() = %v, want %v", got, tt.want)
111+
t.Errorf("hasRecentlyResumed() = %v, want %v", got, tt.want)
125112
}
126113
})
127114
}

0 commit comments

Comments
 (0)