Skip to content

Commit 24262e7

Browse files
committed
fix: introduce 5-second timeout for each Talos client call
This should fix the issue where the controller gets stuck while making requests to Talos nodes. Signed-off-by: Artem Chernyshev <[email protected]>
1 parent b0a8ce2 commit 24262e7

File tree

3 files changed

+25
-0
lines changed

3 files changed

+25
-0
lines changed

controllers/etcd.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"context"
99
"fmt"
1010
"strings"
11+
"time"
1112

1213
controlplanev1 "github.com/siderolabs/cluster-api-control-plane-provider-talos/api/v1alpha3"
1314
"github.com/siderolabs/talos/pkg/machinery/api/machine"
@@ -17,6 +18,10 @@ import (
1718
)
1819

1920
func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, ownedMachines []clusterv1.Machine) error {
21+
ctx, cancel := context.WithTimeout(ctx, time.Second*5)
22+
23+
defer cancel()
24+
2025
machines := []clusterv1.Machine{}
2126

2227
for _, machine := range ownedMachines {
@@ -98,6 +103,10 @@ func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, tcp *
98103
// gracefulEtcdLeave removes a given machine from the etcd cluster by forfeiting leadership
99104
// and issuing a "leave" request from the machine itself.
100105
func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, machineToLeave clusterv1.Machine) error {
106+
ctx, cancel := context.WithTimeout(ctx, time.Second*5)
107+
108+
defer cancel()
109+
101110
r.Log.Info("verifying etcd status", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)
102111

103112
svcs, err := c.ServiceInfo(ctx, "etcd")
@@ -129,6 +138,10 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
129138
// forceEtcdLeave removes a given machine from the etcd cluster by telling another CP node to remove the member.
130139
// This is used in times when the machine was deleted out from under us.
131140
func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, memberName string) error {
141+
ctx, cancel := context.WithTimeout(ctx, time.Second*5)
142+
143+
defer cancel()
144+
132145
r.Log.Info("removing etcd member", "memberName", memberName)
133146

134147
return c.EtcdRemoveMember(
@@ -142,6 +155,10 @@ func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *tal
142155
// auditEtcd rolls through all etcd members to see if there's a matching controlplane machine.
143156
// It uses the first controlplane node returned as the etcd endpoint.
144157
func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster client.ObjectKey, cpName string) error {
158+
ctx, cancel := context.WithTimeout(ctx, time.Second*5)
159+
160+
defer cancel()
161+
145162
machines, err := r.getControlPlaneMachinesForCluster(ctx, cluster, cpName)
146163
if err != nil {
147164
return err

controllers/health.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ func (e *errServiceUnhealthy) Error() string {
2929
}
3030

3131
func (r *TalosControlPlaneReconciler) nodesHealthcheck(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
32+
ctx, cancel := context.WithTimeout(ctx, time.Second*5)
33+
34+
defer cancel()
35+
3236
client, err := r.talosconfigForMachines(ctx, tcp, machines...)
3337
if err != nil {
3438
return err

controllers/taloscontrolplane_controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,10 @@ func (r *TalosControlPlaneReconciler) bootControlPlane(ctx context.Context, clus
408408
}
409409

410410
func (r *TalosControlPlaneReconciler) bootstrapCluster(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
411+
ctx, cancel := context.WithTimeout(ctx, time.Second*5)
412+
413+
defer cancel()
414+
411415
c, err := r.talosconfigForMachines(ctx, tcp, machines...)
412416
if err != nil {
413417
return err

0 commit comments

Comments
 (0)