Skip to content

Commit 58409ef

Browse files
Merge pull request #7260 from sdminonne/CNTRLPLANE-1956
CNTRLPLANE-1956: add logic to check control-plane to data-plane connectivity
2 parents 7424e93 + 0d5f3d6 commit 58409ef

File tree

6 files changed

+349
-1
lines changed

6 files changed

+349
-1
lines changed

api/hypershift/v1beta1/hostedcluster_conditions.go

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -195,6 +195,15 @@ const (
195195
// This condition is used to track the status of the recovery process and to determine if the HostedCluster
196196
// is ready to be used after restoration.
197197
HostedClusterRestoredFromBackup ConditionType = "HostedClusterRestoredFromBackup"
198+
199+
// DataPlaneConnectionAvailable indicates whether the control plane has a successful
200+
// network connection to the data plane components.
201+
// **True** means the control plane can successfully reach the data plane nodes.
202+
// **False** means there are network connection issues preventing the control plane from reaching the data plane.
203+
// A failure here suggests potential issues such as: network policy restrictions,
204+
// firewall rules, missing data plane nodes, or problems with infrastructure
205+
// components like the konnectivity-agent workload.
206+
DataPlaneConnectionAvailable ConditionType = "DataPlaneConnectionAvailable"
198207
)
199208

200209
// Reasons.
@@ -250,7 +259,15 @@ const (
250259

251260
RecoveryFinishedReason = "RecoveryFinished"
252261

262+
ReconcileErrorReason = "ReconcileError"
263+
253264
CloudResourcesCleanupSkippedReason = "CloudResourcesCleanupSkipped"
265+
266+
DataPlaneConnectionNoKonnectivityAgentPodsNotFoundReason = "KonnectivityAgentPodsNotFound"
267+
268+
DataPlaneConnectionLogsAccessFailedReason = "LogsAccessFailed"
269+
270+
DataPlaneConnectionNoWorkerNodesAvailableReason = "NoWorkerNodesAvailable"
254271
)
255272

256273
// Messages.

control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go

Lines changed: 99 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,7 @@ import (
7171
"k8s.io/apimachinery/pkg/types"
7272
utilerrors "k8s.io/apimachinery/pkg/util/errors"
7373
"k8s.io/apimachinery/pkg/util/sets"
74+
clientset "k8s.io/client-go/kubernetes"
7475
"k8s.io/client-go/rest"
7576
apiregistrationv1 "k8s.io/kube-aggregator/pkg/apis/apiregistration/v1"
7677
"k8s.io/utils/ptr"
@@ -127,6 +128,7 @@ exec /bin/azure-cloud-node-manager \
127128
type reconciler struct {
128129
client client.Client
129130
uncachedClient client.Client
131+
clientSet *clientset.Clientset
130132
upsert.CreateOrUpdateProvider
131133
platformType hyperv1.PlatformType
132134
rootCA string
@@ -144,6 +146,18 @@ type reconciler struct {
144146
operateOnReleaseImage string
145147
ImageMetaDataProvider util.ImageMetadataProvider
146148
cleanupTracker *util.CleanupTracker
149+
150+
// GetPodLogs is exposed as a field so unit tests can replace it, since the clientset GetLogs call is hard to mock.
151+
GetPodLogs func(context context.Context, clientset *clientset.Clientset, namespace, name, container string) ([]byte, error)
152+
}
153+
154+
func getPodLogs(ctx context.Context, clientSet *clientset.Clientset, namespace, name, container string) ([]byte, error) {
155+
limit := int64(1024)
156+
opts := &corev1.PodLogOptions{
157+
Container: container,
158+
LimitBytes: &limit,
159+
}
160+
return clientSet.CoreV1().Pods(namespace).GetLogs(name, opts).DoRaw(ctx)
147161
}
148162

149163
// eventHandler is the handler used throughout. As this controller reconciles all kind of different resources
@@ -181,9 +195,15 @@ func Setup(ctx context.Context, opts *operator.HostedClusterConfigOperatorConfig
181195
return fmt.Errorf("failed to create kubevirt infra uncached client: %w", err)
182196
}
183197

198+
clientset, err := clientset.NewForConfig(opts.Manager.GetConfig())
199+
if err != nil {
200+
return fmt.Errorf("failed to initialize kubeClient from config: %w", err)
201+
}
202+
184203
c, err := controller.New(ControllerName, opts.Manager, controller.Options{Reconciler: &reconciler{
185204
client: opts.Manager.GetClient(),
186205
uncachedClient: uncachedClient,
206+
clientSet: clientset,
187207
CreateOrUpdateProvider: opts.TargetCreateOrUpdateProvider,
188208
platformType: opts.PlatformType,
189209
rootCA: opts.InitialCA,
@@ -201,6 +221,7 @@ func Setup(ctx context.Context, opts *operator.HostedClusterConfigOperatorConfig
201221
operateOnReleaseImage: opts.OperateOnReleaseImage,
202222
ImageMetaDataProvider: opts.ImageMetaDataProvider,
203223
cleanupTracker: util.NewCleanupTracker(),
224+
GetPodLogs: getPodLogs,
204225
}})
205226
if err != nil {
206227
return fmt.Errorf("failed to construct controller: %w", err)
@@ -539,6 +560,11 @@ func (r *reconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result
539560
errs = append(errs, fmt.Errorf("failed to reconcile konnectivity agent: %w", err))
540561
}
541562

563+
log.Info("reconciling control-Plane to data-Plane status conditions")
564+
if err := r.reconcileControlPlaneDataPlaneConnectivityConditions(ctx, hcp, log); err != nil {
565+
errs = append(errs, fmt.Errorf("failed to update ControlPlaneToDataPlaneConnectivity condition: %w", err))
566+
}
567+
542568
log.Info("reconciling openshift apiserver apiservices")
543569
if err := r.reconcileOpenshiftAPIServerAPIServices(ctx, hcp); err != nil {
544570
errs = append(errs, fmt.Errorf("failed to reconcile openshift apiserver service: %w", err))
@@ -1389,6 +1415,79 @@ func (r *reconciler) reconcileClusterVersion(ctx context.Context, hcp *hyperv1.H
13891415
return nil
13901416
}
13911417

1418+
func (r *reconciler) reconcileControlPlaneDataPlaneConnectivityConditions(ctx context.Context, hcp *hyperv1.HostedControlPlane, log logr.Logger) error {
1419+
patchHCPWithCondition := func(hcp *hyperv1.HostedControlPlane, condition *metav1.Condition) error {
1420+
originalHCP := hcp.DeepCopy()
1421+
if !meta.SetStatusCondition(&hcp.Status.Conditions, *condition) {
1422+
return nil // No status change; avoid unnecessary API call.
1423+
}
1424+
if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFrom(originalHCP)); err != nil {
1425+
return fmt.Errorf("failed to update HostedControlPlane status with %s condition: %w", condition.Type, err)
1426+
}
1427+
log.Info(string(hyperv1.DataPlaneConnectionAvailable) + " updated")
1428+
return nil
1429+
}
1430+
1431+
condition := &metav1.Condition{
1432+
Type: string(hyperv1.DataPlaneConnectionAvailable),
1433+
Status: metav1.ConditionFalse, // False by default
1434+
}
1435+
totalNodes, err := util.CountAvailableNodes(ctx, r.client)
1436+
if err != nil {
1437+
condition.Status = metav1.ConditionUnknown
1438+
condition.Reason = hyperv1.ReconcileErrorReason
1439+
condition.Message = "Unable to count worker nodes: " + err.Error()
1440+
return patchHCPWithCondition(hcp, condition)
1441+
}
1442+
if totalNodes == 0 {
1443+
condition.Status = metav1.ConditionUnknown
1444+
condition.Reason = hyperv1.DataPlaneConnectionNoWorkerNodesAvailableReason
1445+
condition.Message = "No worker nodes available"
1446+
return patchHCPWithCondition(hcp, condition)
1447+
}
1448+
var podList corev1.PodList
1449+
if err := r.uncachedClient.List(ctx, &podList,
1450+
client.MatchingLabels{"app": "konnectivity-agent"}, client.InNamespace("kube-system")); err != nil {
1451+
condition.Reason = hyperv1.ReconciliationErrorReason
1452+
condition.Message = "Couldn't list konnectivity-agent PODs in kube-system namespace: " + err.Error()
1453+
return patchHCPWithCondition(hcp, condition)
1454+
}
1455+
1456+
logsFound := false
1457+
runningPodsFound := false
1458+
for _, pod := range podList.Items {
1459+
if pod.Status.Phase != corev1.PodRunning {
1460+
continue
1461+
}
1462+
runningPodsFound = true
1463+
data, err := r.GetPodLogs(ctx, r.clientSet, pod.Namespace, pod.Name, "konnectivity-agent")
1464+
if err != nil {
1465+
log.Error(err,
1466+
fmt.Sprintf("failed to get logs for konnectivity-agent pod %s/%s", pod.Namespace, pod.Name))
1467+
continue
1468+
}
1469+
if len(data) > 0 {
1470+
logsFound = true
1471+
break
1472+
}
1473+
}
1474+
if !runningPodsFound {
1475+
condition.Reason = hyperv1.DataPlaneConnectionNoKonnectivityAgentPodsNotFoundReason
1476+
condition.Message = "Couldn't find any konnectivity-agent running in data plane"
1477+
} else {
1478+
if !logsFound {
1479+
condition.Reason = hyperv1.DataPlaneConnectionNoKonnectivityAgentPodsNotFoundReason
1480+
condition.Message = "failed to read konnectivity-agent logs from data plane"
1481+
} else {
1482+
condition.Status = metav1.ConditionTrue
1483+
condition.Reason = hyperv1.AsExpectedReason
1484+
condition.Message = hyperv1.AllIsWellMessage
1485+
}
1486+
}
1487+
1488+
return patchHCPWithCondition(hcp, condition)
1489+
}
1490+
13921491
func (r *reconciler) reconcileOpenshiftAPIServerAPIServices(ctx context.Context, hcp *hyperv1.HostedControlPlane) error {
13931492
rootCA := cpomanifests.RootCASecret(hcp.Namespace)
13941493
if err := r.cpClient.Get(ctx, client.ObjectKeyFromObject(rootCA), rootCA); err != nil {

0 commit comments

Comments
 (0)