Skip to content

Commit 106d42f

Browse files
slintesjaypoulz
andauthored
OCPEDGE-2033: [TNF] Use operator condition to track etcd container removal in ExternalEtcd clusters (#1484)
* TNF: Introduce job readiness condition for etcd container removal Introduce a new mechanism for the TNF setup job to signal CEO when it's ready for etcd container removal by setting a job status condition. This replaces the previous approach of using unsupportedConfigOverrides. Main changes: - set "ReadyForEtcdContainerRemoval" condition on setup job when pacemaker is configured to take over the etcd container - refactor DualReplica main component: - keep track of cluster status - wait for bootstrap completed before starting job controllers (instead of doing that in the auth job, for easier status tracking) - add job informer and dual replica status tracker to TargetConfigController and ExternalEtcdSupportController, for correct handling of the etcd container on job updates based on current dual replica cluster status Signed-off-by: Marc Sluiter <[email protected]> * TNF: Use operator condition instead of job status The problem is that the job status isn't persistent. On CEO updates, the job is recreated. That means that we don't know if pacemaker is running the etcd container already or not, until the job condition is set. On initial cluster installation, pacemaker's etcd isn't running yet, but on updates it is. In the latter case we have to prevent that CEO re-adds its own etcd container. Using an operator condition is persistent and doesn't need workarounds for handling both initial installation and CEO updates. Signed-off-by: Marc Sluiter <[email protected]> * TNF: better naming for DualReplica vs ExternalEtcd For differentiating between DualReplica / TNF specific parts, and the more generic ExternalEtcd parts. Signed-off-by: Marc Sluiter <[email protected]> * TNF: don't check feature gate Signed-off-by: Marc Sluiter <[email protected]> * TNF: run long running event handler in a go routine Signed-off-by: Marc Sluiter <[email protected]> * Fixed unneeded else, and typo Signed-off-by: Marc Sluiter <[email protected]> * TNF: clarify condition name, and align related wording everywhere Signed-off-by: Marc Sluiter <[email protected]> * Reworked external etcd to drive status lookups through ceohelpers - Removed status package (pkg/tnf/pkg/status/) in favor of stateless functions in ceohelpers (pkg/operator/ceohelpers/external_etcd_status.go) - Moved informer initialization earlier in the operator startup sequence to ensure all required informers are running before controllers that depend on them - Added new operator condition constant OperatorConditionEtcdRunningInCluster to track bootstrap completion status - Updated test utilities with WithOperatorCondition() helper function to simplify test setup for operator conditions - Refactored tests to use the new status checking approach, removing mock status objects in favor of operator conditions --------- Signed-off-by: Marc Sluiter <[email protected]> Co-authored-by: Jeremy Poulin <[email protected]>
1 parent 1ba58f9 commit 106d42f

20 files changed

+796
-373
lines changed

bindata/tnfdeployment/clusterrole.yaml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,15 @@ rules:
2020
verbs:
2121
- get
2222
- list
23+
- watch
24+
- apiGroups:
25+
- operator.openshift.io
26+
resources:
27+
- etcds/status
28+
verbs:
29+
- get
2330
- patch
2431
- update
25-
- watch
2632
- apiGroups:
2733
- config.openshift.io
2834
resources:
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
package ceohelpers
2+
3+
import (
4+
"context"
5+
6+
configv1listers "github.com/openshift/client-go/config/listers/config/v1"
7+
"github.com/openshift/library-go/pkg/operator/v1helpers"
8+
"k8s.io/klog/v2"
9+
10+
"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/etcd"
11+
)
12+
13+
type ExternalEtcdClusterStatus struct {
14+
IsExternalEtcdCluster bool
15+
IsEtcdRunningInCluster bool
16+
IsReadyForEtcdTransition bool
17+
}
18+
19+
// IsExternalEtcdCluster determines if the cluster is configured for external etcd
20+
// by checking if the control plane topology is set to DualReplicaTopologyMode.
21+
// This indicates that the cluster is using Two Node Fencing (TNF) with external etcd.
22+
func IsExternalEtcdCluster(ctx context.Context, infraLister configv1listers.InfrastructureLister) (bool, error) {
23+
dualReplicaEnabled, err := IsDualReplicaTopology(ctx, infraLister)
24+
if err != nil {
25+
klog.Errorf("failed to determine DualReplicaTopology: %v", err)
26+
return false, err
27+
}
28+
29+
if dualReplicaEnabled {
30+
klog.V(4).Infof("detected DualReplica topology - external etcd cluster")
31+
}
32+
33+
return dualReplicaEnabled, nil
34+
}
35+
36+
// IsReadyForEtcdTransition checks if the cluster is ready for etcd transition
37+
// by examining the operator status for the ExternalEtcdReadyForTransition condition.
38+
// This condition is set when the TNF setup is ready to take over the etcd container.
39+
func IsReadyForEtcdTransition(ctx context.Context, operatorClient v1helpers.StaticPodOperatorClient) (bool, error) {
40+
_, opStatus, _, err := operatorClient.GetStaticPodOperatorState()
41+
if err != nil {
42+
klog.Errorf("failed to get static pod operator state: %v", err)
43+
return false, err
44+
}
45+
46+
readyForEtcdTransition := v1helpers.IsOperatorConditionTrue(opStatus.Conditions, etcd.OperatorConditionExternalEtcdReadyForTransition)
47+
if readyForEtcdTransition {
48+
klog.V(4).Infof("ready for etcd transition")
49+
}
50+
51+
return readyForEtcdTransition, nil
52+
}
53+
54+
// IsEtcdRunningInCluster checks if the etcd bootstrap process is completed
55+
// by examining the operator status for the EtcdRunningInCluster condition.
56+
func IsEtcdRunningInCluster(ctx context.Context, operatorClient v1helpers.StaticPodOperatorClient) (bool, error) {
57+
_, opStatus, _, err := operatorClient.GetStaticPodOperatorState()
58+
if err != nil {
59+
klog.Errorf("failed to get static pod operator state: %v", err)
60+
return false, err
61+
}
62+
63+
etcdRunningInCluster := v1helpers.IsOperatorConditionTrue(opStatus.Conditions, etcd.OperatorConditionEtcdRunningInCluster)
64+
if etcdRunningInCluster {
65+
klog.V(4).Infof("bootstrap completed, etcd running in cluster")
66+
}
67+
68+
return etcdRunningInCluster, nil
69+
}
70+
71+
// GetExternalEtcdClusterStatus provides a comprehensive status check for external etcd clusters.
72+
// It returns the external etcd status, bootstrap completion status, and readiness for transition.
73+
func GetExternalEtcdClusterStatus(ctx context.Context,
74+
operatorClient v1helpers.StaticPodOperatorClient,
75+
infraLister configv1listers.InfrastructureLister) (externalEtcdStatus ExternalEtcdClusterStatus, err error) {
76+
77+
externalEtcdStatus = ExternalEtcdClusterStatus{
78+
IsExternalEtcdCluster: false,
79+
IsEtcdRunningInCluster: false,
80+
IsReadyForEtcdTransition: false,
81+
}
82+
83+
// Check if this is an external etcd cluster
84+
externalEtcdStatus.IsExternalEtcdCluster, err = IsExternalEtcdCluster(ctx, infraLister)
85+
if err != nil {
86+
return externalEtcdStatus, err
87+
}
88+
89+
// If not external etcd, return early
90+
if !externalEtcdStatus.IsExternalEtcdCluster {
91+
return externalEtcdStatus, nil
92+
}
93+
94+
// Get operator status once for both bootstrap and transition checks
95+
_, opStatus, _, err := operatorClient.GetStaticPodOperatorState()
96+
if err != nil {
97+
klog.Errorf("failed to get static pod operator state: %v", err)
98+
return externalEtcdStatus, err
99+
}
100+
101+
// Check bootstrap completion
102+
externalEtcdStatus.IsEtcdRunningInCluster = v1helpers.IsOperatorConditionTrue(opStatus.Conditions, etcd.OperatorConditionEtcdRunningInCluster)
103+
104+
// Check readiness for transition
105+
externalEtcdStatus.IsReadyForEtcdTransition = v1helpers.IsOperatorConditionTrue(opStatus.Conditions, etcd.OperatorConditionExternalEtcdReadyForTransition)
106+
107+
if externalEtcdStatus.IsEtcdRunningInCluster {
108+
klog.V(4).Infof("bootstrap completed, etcd running in cluster")
109+
}
110+
if externalEtcdStatus.IsReadyForEtcdTransition {
111+
klog.V(4).Infof("ready for etcd transition")
112+
}
113+
114+
return externalEtcdStatus, nil
115+
}
Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
package ceohelpers
2+
3+
import (
4+
"context"
5+
"testing"
6+
7+
configv1 "github.com/openshift/api/config/v1"
8+
operatorv1 "github.com/openshift/api/operator/v1"
9+
"github.com/stretchr/testify/require"
10+
11+
"github.com/openshift/cluster-etcd-operator/pkg/testutils"
12+
"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/etcd"
13+
)
14+
15+
func TestIsExternalEtcdCluster(t *testing.T) {
16+
tests := []struct {
17+
name string
18+
topology configv1.TopologyMode
19+
expectedResult bool
20+
expectError bool
21+
}{
22+
{
23+
name: "HighlyAvailable topology - not external etcd",
24+
topology: configv1.HighlyAvailableTopologyMode,
25+
expectedResult: false,
26+
expectError: false,
27+
},
28+
{
29+
name: "DualReplica topology - external etcd",
30+
topology: configv1.DualReplicaTopologyMode,
31+
expectedResult: true,
32+
expectError: false,
33+
},
34+
{
35+
name: "SingleReplica topology - not external etcd",
36+
topology: configv1.SingleReplicaTopologyMode,
37+
expectedResult: false,
38+
expectError: false,
39+
},
40+
}
41+
42+
for _, tt := range tests {
43+
t.Run(tt.name, func(t *testing.T) {
44+
// Create fake infrastructure lister
45+
infraLister := testutils.FakeInfrastructureLister(t, tt.topology)
46+
47+
// Test the function
48+
result, err := IsExternalEtcdCluster(context.Background(), infraLister)
49+
50+
if tt.expectError {
51+
require.Error(t, err)
52+
} else {
53+
require.NoError(t, err)
54+
require.Equal(t, tt.expectedResult, result)
55+
}
56+
})
57+
}
58+
}
59+
60+
func TestIsReadyForEtcdTransition(t *testing.T) {
61+
tests := []struct {
62+
name string
63+
operatorConditions []operatorv1.OperatorCondition
64+
expectedResult bool
65+
expectError bool
66+
}{
67+
{
68+
name: "No conditions - not ready",
69+
operatorConditions: []operatorv1.OperatorCondition{},
70+
expectedResult: false,
71+
expectError: false,
72+
},
73+
{
74+
name: "ExternalEtcdReadyForTransition condition true - ready",
75+
operatorConditions: []operatorv1.OperatorCondition{
76+
{
77+
Type: etcd.OperatorConditionExternalEtcdReadyForTransition,
78+
Status: operatorv1.ConditionTrue,
79+
},
80+
},
81+
expectedResult: true,
82+
expectError: false,
83+
},
84+
{
85+
name: "ExternalEtcdReadyForTransition condition false - not ready",
86+
operatorConditions: []operatorv1.OperatorCondition{
87+
{
88+
Type: etcd.OperatorConditionExternalEtcdReadyForTransition,
89+
Status: operatorv1.ConditionFalse,
90+
},
91+
},
92+
expectedResult: false,
93+
expectError: false,
94+
},
95+
{
96+
name: "Other conditions present but not ExternalEtcdReadyForTransition - not ready",
97+
operatorConditions: []operatorv1.OperatorCondition{
98+
{
99+
Type: etcd.OperatorConditionEtcdRunningInCluster,
100+
Status: operatorv1.ConditionTrue,
101+
},
102+
},
103+
expectedResult: false,
104+
expectError: false,
105+
},
106+
}
107+
108+
for _, tt := range tests {
109+
t.Run(tt.name, func(t *testing.T) {
110+
// Create fake operator client
111+
operatorClient := testutils.FakeStaticPodOperatorClient(t, tt.operatorConditions)
112+
113+
// Test the function
114+
result, err := IsReadyForEtcdTransition(context.Background(), operatorClient)
115+
116+
if tt.expectError {
117+
require.Error(t, err)
118+
} else {
119+
require.NoError(t, err)
120+
require.Equal(t, tt.expectedResult, result)
121+
}
122+
})
123+
}
124+
}
125+
126+
func TestIsEtcdRunningInCluster(t *testing.T) {
127+
tests := []struct {
128+
name string
129+
operatorConditions []operatorv1.OperatorCondition
130+
expectedResult bool
131+
expectError bool
132+
}{
133+
{
134+
name: "No conditions - not completed",
135+
operatorConditions: []operatorv1.OperatorCondition{},
136+
expectedResult: false,
137+
expectError: false,
138+
},
139+
{
140+
name: "EtcdRunningInCluster condition true - completed",
141+
operatorConditions: []operatorv1.OperatorCondition{
142+
{
143+
Type: etcd.OperatorConditionEtcdRunningInCluster,
144+
Status: operatorv1.ConditionTrue,
145+
},
146+
},
147+
expectedResult: true,
148+
expectError: false,
149+
},
150+
{
151+
name: "EtcdRunningInCluster condition false - not completed",
152+
operatorConditions: []operatorv1.OperatorCondition{
153+
{
154+
Type: etcd.OperatorConditionEtcdRunningInCluster,
155+
Status: operatorv1.ConditionFalse,
156+
},
157+
},
158+
expectedResult: false,
159+
expectError: false,
160+
},
161+
}
162+
163+
for _, tt := range tests {
164+
t.Run(tt.name, func(t *testing.T) {
165+
// Create fake operator client
166+
operatorClient := testutils.FakeStaticPodOperatorClient(t, tt.operatorConditions)
167+
168+
// Test the function
169+
result, err := IsEtcdRunningInCluster(context.Background(), operatorClient)
170+
171+
if tt.expectError {
172+
require.Error(t, err)
173+
} else {
174+
require.NoError(t, err)
175+
require.Equal(t, tt.expectedResult, result)
176+
}
177+
})
178+
}
179+
}
180+
181+
func TestGetExternalEtcdClusterStatus(t *testing.T) {
182+
tests := []struct {
183+
name string
184+
topology configv1.TopologyMode
185+
operatorConditions []operatorv1.OperatorCondition
186+
expectedExternalEtcd bool
187+
expectedEtcdRunningInCluster bool
188+
expectedReadyForTransition bool
189+
expectError bool
190+
}{
191+
{
192+
name: "HighlyAvailable topology - not external etcd",
193+
topology: configv1.HighlyAvailableTopologyMode,
194+
operatorConditions: []operatorv1.OperatorCondition{},
195+
expectedExternalEtcd: false,
196+
expectedEtcdRunningInCluster: false,
197+
expectedReadyForTransition: false,
198+
expectError: false,
199+
},
200+
{
201+
name: "DualReplica topology with no conditions",
202+
topology: configv1.DualReplicaTopologyMode,
203+
operatorConditions: []operatorv1.OperatorCondition{},
204+
expectedExternalEtcd: true,
205+
expectedEtcdRunningInCluster: false,
206+
expectedReadyForTransition: false,
207+
expectError: false,
208+
},
209+
{
210+
name: "DualReplica topology with bootstrap completed",
211+
topology: configv1.DualReplicaTopologyMode,
212+
operatorConditions: []operatorv1.OperatorCondition{
213+
{
214+
Type: etcd.OperatorConditionEtcdRunningInCluster,
215+
Status: operatorv1.ConditionTrue,
216+
},
217+
},
218+
expectedExternalEtcd: true,
219+
expectedEtcdRunningInCluster: true,
220+
expectedReadyForTransition: false,
221+
expectError: false,
222+
},
223+
{
224+
name: "DualReplica topology with bootstrap completed and ready for transition",
225+
topology: configv1.DualReplicaTopologyMode,
226+
operatorConditions: []operatorv1.OperatorCondition{
227+
{
228+
Type: etcd.OperatorConditionEtcdRunningInCluster,
229+
Status: operatorv1.ConditionTrue,
230+
},
231+
{
232+
Type: etcd.OperatorConditionExternalEtcdReadyForTransition,
233+
Status: operatorv1.ConditionTrue,
234+
},
235+
},
236+
expectedExternalEtcd: true,
237+
expectedEtcdRunningInCluster: true,
238+
expectedReadyForTransition: true,
239+
expectError: false,
240+
},
241+
}
242+
243+
for _, tt := range tests {
244+
t.Run(tt.name, func(t *testing.T) {
245+
// Create fake infrastructure lister
246+
infraLister := testutils.FakeInfrastructureLister(t, tt.topology)
247+
248+
// Create fake operator client
249+
operatorClient := testutils.FakeStaticPodOperatorClient(t, tt.operatorConditions)
250+
251+
// Test the function
252+
externalEtcdStatus, err := GetExternalEtcdClusterStatus(
253+
context.Background(), operatorClient, infraLister)
254+
255+
if tt.expectError {
256+
require.Error(t, err)
257+
} else {
258+
require.NoError(t, err)
259+
require.Equal(t, tt.expectedExternalEtcd, externalEtcdStatus.IsExternalEtcdCluster)
260+
require.Equal(t, tt.expectedEtcdRunningInCluster, externalEtcdStatus.IsEtcdRunningInCluster)
261+
require.Equal(t, tt.expectedReadyForTransition, externalEtcdStatus.IsReadyForEtcdTransition)
262+
}
263+
})
264+
}
265+
}

0 commit comments

Comments
 (0)