Skip to content

Commit efa0345

Browse files
committed
fix: fall back to old scheme of getting talosconfig for older templates
This PR contains a couple of fixes. In case the TCP has an `init` nodes config: - mark the control plane as bootstrapped immediately. - populate the nodes' talosconfig using workload cluster node info instead of using machine addresses. Signed-off-by: Artem Chernyshev <[email protected]>
1 parent 89f793e commit efa0345

File tree

4 files changed

+105
-20
lines changed

4 files changed

+105
-20
lines changed

controllers/configs.go

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,15 @@ import (
88
"context"
99
"fmt"
1010
"net"
11+
"reflect"
1112
"time"
1213

1314
cabptv1 "github.com/talos-systems/cluster-api-bootstrap-provider-talos/api/v1alpha3"
15+
controlplanev1 "github.com/talos-systems/cluster-api-control-plane-provider-talos/api/v1alpha3"
1416
talosclient "github.com/talos-systems/talos/pkg/machinery/client"
1517
talosconfig "github.com/talos-systems/talos/pkg/machinery/client/config"
1618
corev1 "k8s.io/api/core/v1"
19+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1720
"k8s.io/apimachinery/pkg/types"
1821
"k8s.io/client-go/kubernetes"
1922
"k8s.io/client-go/tools/clientcmd"
@@ -75,11 +78,15 @@ func (r *TalosControlPlaneReconciler) kubeconfigForCluster(ctx context.Context,
7578
}
7679

7780
// talosconfigForMachine will generate a talosconfig that uses *all* found addresses as the endpoints.
78-
func (r *TalosControlPlaneReconciler) talosconfigForMachines(ctx context.Context, machines ...clusterv1.Machine) (*talosclient.Client, error) {
81+
func (r *TalosControlPlaneReconciler) talosconfigForMachines(ctx context.Context, tcp *controlplanev1.TalosControlPlane, machines ...clusterv1.Machine) (*talosclient.Client, error) {
7982
if len(machines) == 0 {
8083
return nil, fmt.Errorf("at least one machine should be provided")
8184
}
8285

86+
if !reflect.ValueOf(tcp.Spec.ControlPlaneConfig.InitConfig).IsZero() {
87+
return r.talosconfigFromWorkloadCluster(ctx, client.ObjectKey{Namespace: tcp.GetNamespace(), Name: tcp.GetLabels()["cluster.x-k8s.io/cluster-name"]}, machines...)
88+
}
89+
8390
addrList := []string{}
8491

8592
var t *talosconfig.Config
@@ -130,3 +137,74 @@ func (r *TalosControlPlaneReconciler) talosconfigForMachines(ctx context.Context
130137

131138
return talosclient.New(ctx, talosclient.WithEndpoints(addrList...), talosclient.WithConfig(t))
132139
}
140+
141+
// talosconfigFromWorkloadCluster gets talosconfig and populates endoints using workload cluster nodes.
142+
func (r *TalosControlPlaneReconciler) talosconfigFromWorkloadCluster(ctx context.Context, cluster client.ObjectKey, machines ...clusterv1.Machine) (*talosclient.Client, error) {
143+
if len(machines) == 0 {
144+
return nil, fmt.Errorf("at least one machine should be provided")
145+
}
146+
147+
clientset, err := r.kubeconfigForCluster(ctx, cluster)
148+
if err != nil {
149+
return nil, err
150+
}
151+
152+
addrList := []string{}
153+
154+
var t *talosconfig.Config
155+
156+
for _, machine := range machines {
157+
if machine.Status.NodeRef == nil {
158+
return nil, fmt.Errorf("%q machine does not have a nodeRef", machine.Name)
159+
}
160+
161+
// grab all addresses as endpoints
162+
node, err := clientset.CoreV1().Nodes().Get(ctx, machine.Status.NodeRef.Name, metav1.GetOptions{})
163+
if err != nil {
164+
return nil, err
165+
}
166+
167+
for _, addr := range node.Status.Addresses {
168+
if addr.Type == corev1.NodeExternalIP || addr.Type == corev1.NodeInternalIP {
169+
addrList = append(addrList, addr.Address)
170+
}
171+
}
172+
173+
if len(addrList) == 0 {
174+
return nil, fmt.Errorf("no addresses were found for node %q", node.Name)
175+
}
176+
177+
if t == nil {
178+
var (
179+
cfgs cabptv1.TalosConfigList
180+
found *cabptv1.TalosConfig
181+
)
182+
183+
// find talosconfig in the machine's namespace
184+
err = r.Client.List(ctx, &cfgs, client.InNamespace(machine.Namespace))
185+
if err != nil {
186+
return nil, err
187+
}
188+
189+
for _, cfg := range cfgs.Items {
190+
for _, ref := range cfg.OwnerReferences {
191+
if ref.Kind == "Machine" && ref.Name == machine.Name {
192+
found = &cfg
193+
break
194+
}
195+
}
196+
}
197+
198+
if found == nil {
199+
return nil, fmt.Errorf("failed to find TalosConfig for %q", machine.Name)
200+
}
201+
202+
t, err = talosconfig.FromString(found.Status.TalosConfig)
203+
if err != nil {
204+
return nil, err
205+
}
206+
}
207+
}
208+
209+
return talosclient.New(ctx, talosclient.WithEndpoints(addrList...), talosclient.WithConfig(t))
210+
}

controllers/etcd.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,15 @@ import (
99
"fmt"
1010
"strings"
1111

12+
controlplanev1 "github.com/talos-systems/cluster-api-control-plane-provider-talos/api/v1alpha3"
1213
"github.com/talos-systems/talos/pkg/machinery/api/machine"
1314
talosclient "github.com/talos-systems/talos/pkg/machinery/client"
1415
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
1516
"sigs.k8s.io/cluster-api/util"
1617
"sigs.k8s.io/controller-runtime/pkg/client"
1718
)
1819

19-
func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, cluster *clusterv1.Cluster, ownedMachines []clusterv1.Machine) error {
20+
func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, ownedMachines []clusterv1.Machine) error {
2021
kubeclient, err := r.kubeconfigForCluster(ctx, util.ObjectKey(cluster))
2122
if err != nil {
2223
return err
@@ -32,7 +33,7 @@ func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, clust
3233
}
3334
}
3435

35-
c, err := r.talosconfigForMachines(ctx, machines...)
36+
c, err := r.talosconfigForMachines(ctx, tcp, machines...)
3637
if err != nil {
3738
return err
3839
}
@@ -148,7 +149,7 @@ func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *tal
148149

149150
// auditEtcd rolls through all etcd members to see if there's a matching controlplane machine
150151
// It uses the first controlplane node returned as the etcd endpoint
151-
func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, cluster client.ObjectKey, cpName string) error {
152+
func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster client.ObjectKey, cpName string) error {
152153
machines, err := r.getControlPlaneMachinesForCluster(ctx, cluster, cpName)
153154
if err != nil {
154155
return err
@@ -182,7 +183,7 @@ func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, cluster cli
182183
return fmt.Errorf("no CP machine which is not being deleted and has node ref")
183184
}
184185

185-
c, err := r.talosconfigForMachines(ctx, designatedCPMachine)
186+
c, err := r.talosconfigForMachines(ctx, tcp, designatedCPMachine)
186187
if err != nil {
187188
return err
188189
}

controllers/health.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"time"
1212

1313
"github.com/pkg/errors"
14+
controlplanev1 "github.com/talos-systems/cluster-api-control-plane-provider-talos/api/v1alpha3"
1415
machineapi "github.com/talos-systems/talos/pkg/machinery/api/machine"
1516
talosclient "github.com/talos-systems/talos/pkg/machinery/client"
1617
"google.golang.org/grpc/codes"
@@ -27,8 +28,8 @@ func (e *errServiceUnhealthy) Error() string {
2728
return fmt.Sprintf("Service %s is unhealthy: %s", e.service, e.reason)
2829
}
2930

30-
func (r *TalosControlPlaneReconciler) nodesHealthcheck(ctx context.Context, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
31-
client, err := r.talosconfigForMachines(ctx, machines...)
31+
func (r *TalosControlPlaneReconciler) nodesHealthcheck(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
32+
client, err := r.talosconfigForMachines(ctx, tcp, machines...)
3233
if err != nil {
3334
return err
3435
}
@@ -54,8 +55,8 @@ func (r *TalosControlPlaneReconciler) nodesHealthcheck(ctx context.Context, clus
5455
return nil
5556
}
5657

57-
func (r *TalosControlPlaneReconciler) ensureNodesBooted(ctx context.Context, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
58-
client, err := r.talosconfigForMachines(ctx, machines...)
58+
func (r *TalosControlPlaneReconciler) ensureNodesBooted(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
59+
client, err := r.talosconfigForMachines(ctx, tcp, machines...)
5960
if err != nil {
6061
return err
6162
}

controllers/taloscontrolplane_controller.go

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ func newControlPlane(cluster *clusterv1.Cluster, tcp *controlplanev1.TalosContro
302302
}
303303
}
304304

305-
func (r *TalosControlPlaneReconciler) scaleDownControlPlane(ctx context.Context, cluster client.ObjectKey, cpName string, machines []clusterv1.Machine) (ctrl.Result, error) {
305+
func (r *TalosControlPlaneReconciler) scaleDownControlPlane(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster client.ObjectKey, cpName string, machines []clusterv1.Machine) (ctrl.Result, error) {
306306
if len(machines) == 0 {
307307
return ctrl.Result{}, fmt.Errorf("no machines found")
308308
}
@@ -359,7 +359,7 @@ func (r *TalosControlPlaneReconciler) scaleDownControlPlane(ctx context.Context,
359359

360360
node := deleteMachine.Status.NodeRef
361361

362-
c, err := r.talosconfigForMachines(ctx, deleteMachine)
362+
c, err := r.talosconfigForMachines(ctx, tcp, deleteMachine)
363363
if err != nil {
364364
return ctrl.Result{RequeueAfter: 20 * time.Second}, err
365365
}
@@ -525,8 +525,8 @@ func (r *TalosControlPlaneReconciler) bootControlPlane(ctx context.Context, clus
525525
return ctrl.Result{Requeue: true}, nil
526526
}
527527

528-
func (r *TalosControlPlaneReconciler) bootstrapCluster(ctx context.Context, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
529-
c, err := r.talosconfigForMachines(ctx, machines...)
528+
func (r *TalosControlPlaneReconciler) bootstrapCluster(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
529+
c, err := r.talosconfigForMachines(ctx, tcp, machines...)
530530
if err != nil {
531531
return err
532532
}
@@ -761,11 +761,11 @@ func (r *TalosControlPlaneReconciler) reconcileKubeconfig(ctx context.Context, c
761761
func (r *TalosControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context, cluster *clusterv1.Cluster, tcp *controlplanev1.TalosControlPlane, machines []clusterv1.Machine) (result ctrl.Result, err error) {
762762
var errs error
763763
// Audit the etcd member list to remove any nodes that no longer exist
764-
if err := r.auditEtcd(ctx, util.ObjectKey(cluster), tcp.Name); err != nil {
764+
if err := r.auditEtcd(ctx, tcp, util.ObjectKey(cluster), tcp.Name); err != nil {
765765
errs = kerrors.NewAggregate([]error{errs, err})
766766
}
767767

768-
if err := r.etcdHealthcheck(ctx, cluster, machines); err != nil {
768+
if err := r.etcdHealthcheck(ctx, tcp, cluster, machines); err != nil {
769769
conditions.MarkFalse(tcp, controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason,
770770
clusterv1.ConditionSeverityWarning, err.Error())
771771
errs = kerrors.NewAggregate([]error{errs, err})
@@ -781,7 +781,7 @@ func (r *TalosControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context,
781781
}
782782

783783
func (r *TalosControlPlaneReconciler) reconcileNodeHealth(ctx context.Context, cluster *clusterv1.Cluster, tcp *controlplanev1.TalosControlPlane, machines []clusterv1.Machine) (result ctrl.Result, err error) {
784-
if err := r.nodesHealthcheck(ctx, cluster, machines); err != nil {
784+
if err := r.nodesHealthcheck(ctx, tcp, cluster, machines); err != nil {
785785
reason := controlplanev1.ControlPlaneComponentsInspectionFailedReason
786786

787787
if errors.Is(err, &errServiceUnhealthy{}) {
@@ -851,7 +851,7 @@ func (r *TalosControlPlaneReconciler) reconcileMachines(ctx context.Context, clu
851851
return res, nil
852852
}
853853

854-
if err := r.ensureNodesBooted(ctx, cluster, machines); err != nil {
854+
if err := r.ensureNodesBooted(ctx, controlPlane.TCP, cluster, machines); err != nil {
855855
logger.Info("waiting for all nodes to finish boot sequence", "error", err)
856856

857857
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
@@ -865,7 +865,7 @@ func (r *TalosControlPlaneReconciler) reconcileMachines(ctx context.Context, clu
865865

866866
logger.Info("scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines)
867867

868-
res, err = r.scaleDownControlPlane(ctx, util.ObjectKey(cluster), controlPlane.TCP.Name, machines)
868+
res, err = r.scaleDownControlPlane(ctx, tcp, util.ObjectKey(cluster), controlPlane.TCP.Name, machines)
869869
if err != nil {
870870
if res.Requeue || res.RequeueAfter > 0 {
871871
logger.Info("failed to scale down control plane", "error", err)
@@ -876,8 +876,13 @@ func (r *TalosControlPlaneReconciler) reconcileMachines(ctx context.Context, clu
876876

877877
return res, err
878878
default:
879-
if !tcp.Status.Bootstrapped && reflect.ValueOf(tcp.Spec.ControlPlaneConfig.InitConfig).IsZero() {
880-
if err := r.bootstrapCluster(ctx, cluster, machines); err != nil {
879+
if !reflect.ValueOf(tcp.Spec.ControlPlaneConfig.InitConfig).IsZero() {
880+
tcp.Status.Bootstrapped = true
881+
conditions.MarkTrue(tcp, controlplanev1.MachinesBootstrapped)
882+
}
883+
884+
if !tcp.Status.Bootstrapped {
885+
if err := r.bootstrapCluster(ctx, tcp, cluster, machines); err != nil {
881886
conditions.MarkFalse(tcp, controlplanev1.MachinesBootstrapped, controlplanev1.WaitingForTalosBootReason, clusterv1.ConditionSeverityInfo, err.Error())
882887

883888
logger.Info("bootstrap failed, retrying in 20 seconds", "error", err)

0 commit comments

Comments
 (0)