Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions src/installer/installer.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,24 @@ func (i *installer) InstallNode() error {
}
}

if isBootstrap || i.ControlPlaneCount == 1 {
// Wait for MachineConfig annotations on all master nodes to be consistent
// before rebooting the bootstrap. This prevents the stale annotation deadlock
// where nodes point to non-existent MachineConfigs.
var kc k8s_client.K8SClient

kc, err = i.kcBuilder(KubeconfigPath, i.log)
if err != nil {
i.log.WithError(err).Error("failed to get K8s client")

return err
}

if err = i.waitForMCAnnotationsConsistent(ctx, kc); err != nil {
return fmt.Errorf("failed to wait for MachineConfig annotations to be consistent: %w", err)
}
}

//upload host logs and report log status before reboot
i.log.Infof("Uploading logs and reporting status before rebooting the node %s for cluster %s", i.Config.HostID, i.Config.ClusterID)
i.inventoryClient.HostLogProgressReport(ctx, i.Config.InfraEnvID, i.Config.HostID, models.LogsStateRequested)
Expand Down Expand Up @@ -896,6 +914,76 @@ func (i *installer) waitForNodes(ctx context.Context, minNodes int, role string,
}
}

// MachineConfig Operator node annotation keys read by waitForMCAnnotationsConsistent.
const (
	// mcCurrentConfigAnnotation names the MachineConfig currently applied to the node.
	mcCurrentConfigAnnotation = "machineconfiguration.openshift.io/currentConfig"
	// mcDesiredConfigAnnotation names the MachineConfig the node is converging to.
	mcDesiredConfigAnnotation = "machineconfiguration.openshift.io/desiredConfig"
	// mcStateAnnotation reports the node's MC update state (e.g. "Done"); used here for logging only.
	mcStateAnnotation = "machineconfiguration.openshift.io/state"
)

// waitForMCAnnotationsConsistent blocks until every master node carries
// MachineConfig annotations that reference MachineConfig objects which
// actually exist on the cluster, and all masters agree on one desired
// config. This prevents the bootstrap from rebooting while nodes still
// hold stale annotations pointing at non-existent MachineConfigs.
func (i *installer) waitForMCAnnotationsConsistent(ctx context.Context, kc k8s_client.K8SClient) error {
	i.log.Info("Waiting for MachineConfig annotations to be consistent on all master nodes")

	return utils.WaitForPredicateWithContext(ctx, waitForeverTimeout, generalWaitInterval, func() bool {
		masters, err := kc.ListNodesByRole("master")
		if err != nil {
			i.log.WithError(err).Warn("Failed to list master nodes")

			return false
		}

		if len(masters.Items) == 0 {
			i.log.Infof("No master nodes found, waiting...")

			return false
		}

		// The first node's desiredConfig becomes the reference value that
		// every subsequent master must match.
		sharedDesired := ""

		for _, master := range masters.Items {
			current := master.Annotations[mcCurrentConfigAnnotation]
			desired := master.Annotations[mcDesiredConfigAnnotation]
			mcState := master.Annotations[mcStateAnnotation]

			// Annotations are applied asynchronously; keep polling until both are present.
			if current == "" || desired == "" {
				i.log.Infof("Node %s has no MC annotations yet, waiting...", master.Name)
				return false
			}

			// The referenced current config must exist on the cluster.
			if _, getErr := kc.GetMachineConfig(ctx, current); getErr != nil {
				i.log.Warnf("Node %s has currentConfig %s which does not exist (state=%s), waiting...",
					master.Name, current, mcState)
				return false
			}

			// The referenced desired config must exist on the cluster as well.
			if _, getErr := kc.GetMachineConfig(ctx, desired); getErr != nil {
				i.log.Warnf("Node %s has desiredConfig %s which does not exist (state=%s), waiting...",
					master.Name, desired, mcState)
				return false
			}

			// All masters must target the same desiredConfig (pool is converged).
			switch {
			case sharedDesired == "":
				sharedDesired = desired
			case desired != sharedDesired:
				i.log.Warnf("Node %s has desiredConfig %s, expected %s (state=%s), waiting...",
					master.Name, desired, sharedDesired, mcState)
				return false
			}
		}

		i.log.Infof("All master nodes have consistent MachineConfig annotations (desired=%s)", sharedDesired)

		return true
	})
}

func (i *installer) getInventoryHostsMap(hostsMap map[string]inventory_client.HostData) (map[string]inventory_client.HostData, error) {
var err error
if hostsMap == nil {
Expand Down
37 changes: 37 additions & 0 deletions src/installer/installer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,29 @@ var _ = Describe("installer HostRoleMaster role", func() {
}, nil)
}

// waitForMCAnnotationsConsistentSuccess sets expectations for the MC annotations
// check that runs after waitForWorkers (bootstrap only). Call after WaitMasterNodesSucccess
// so that ListNodesByRole("master") expectations are consumed in the right order.
waitForMCAnnotationsConsistentSuccess := func() {
	// A single master whose current/desired configs match and exist.
	annotatedMaster := v1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: "master-0",
			Annotations: map[string]string{
				"machineconfiguration.openshift.io/currentConfig": "rendered-master-12345",
				"machineconfiguration.openshift.io/desiredConfig": "rendered-master-12345",
				"machineconfiguration.openshift.io/state":         "Done",
			},
		},
	}

	mockk8sclient.EXPECT().ListNodesByRole("master").Return(&v1.NodeList{Items: []v1.Node{annotatedMaster}}, nil).MinTimes(1)
	mockk8sclient.EXPECT().GetMachineConfig(gomock.Any(), gomock.Any()).Return(&mcfgv1.MachineConfig{}, nil).MinTimes(1)
}

setupInvoker := func(invoker ...string) {
i := "assisted-service"
if len(invoker) > 0 {
Expand Down Expand Up @@ -401,6 +424,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
waitForETCDBootstrapSuccess()
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForMCAnnotationsConsistentSuccess()
waitForControllerSuccessfully(conf.ClusterID)
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
Expand Down Expand Up @@ -438,6 +462,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -476,6 +501,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -514,6 +540,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -561,6 +588,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -597,6 +625,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -633,6 +662,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -670,6 +700,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -712,6 +743,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForWorkersSuccessfully()
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -754,6 +786,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForWorkersSuccessfully()
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -834,6 +867,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -871,6 +905,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -988,6 +1023,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
bootstrapETCDStatusSuccess()
resolvConfSuccess()
waitForControllerSuccessfully(conf.ClusterID)
waitForMCAnnotationsConsistentSuccess()
//HostRoleMaster flow:
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
writeToDiskSuccess(gomock.Any())
Expand Down Expand Up @@ -1450,6 +1486,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
//HostRoleMaster flow:
verifySingleNodeMasterIgnitionSuccess()
singleNodeMergeIgnitionSuccess()
waitForMCAnnotationsConsistentSuccess()
downloadHostIgnitionSuccess(infraEnvId, hostId, "master-host-id.ign")
mockops.EXPECT().WriteImageToDisk(gomock.Any(), singleNodeMasterIgnitionPath, device, nil).Return(nil).Times(1)
setBootOrderSuccess()
Expand Down
60 changes: 36 additions & 24 deletions src/k8s_client/k8s_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ import (
certificatesClient "k8s.io/client-go/kubernetes/typed/certificates/v1"
"k8s.io/client-go/tools/clientcmd"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
runtimeconfig "sigs.k8s.io/controller-runtime/pkg/client/config"

"github.com/openshift/assisted-installer/src/ops"
"github.com/openshift/assisted-installer/src/utils"
Expand Down Expand Up @@ -89,6 +88,7 @@ type K8SClient interface {
IsClusterCapabilityEnabled(configv1.ClusterVersionCapability) (bool, error)
UntaintNode(name string) error
PatchMachineConfigPoolPaused(pause bool, mcpName string) error
GetMachineConfig(ctx context.Context, name string) (*mcfgv1.MachineConfig, error)
}

type K8SClientBuilder func(configPath string, logger logrus.FieldLogger) (K8SClient, error)
Expand Down Expand Up @@ -133,32 +133,33 @@ func NewK8SClient(configPath string, logger logrus.FieldLogger) (K8SClient, erro
if err != nil {
return &k8sClient{}, errors.Wrap(err, "creating openshift config client")
}
var runtimeClient runtimeclient.Client
if configPath == "" {
scheme := runtime.NewScheme()
err = clientgoscheme.AddToScheme(scheme)
if err != nil {
return &k8sClient{}, errors.Wrap(err, "failed to add scheme to")
}

err = metal3v1alpha1.AddToScheme(scheme)
if err != nil {
return &k8sClient{}, errors.Wrap(err, "failed to add BMH scheme")
}
err = machinev1beta1.AddToScheme(scheme)
if err != nil {
return &k8sClient{}, errors.Wrap(err, "failed to add Machine scheme")
}
// Always create runtime client with full scheme support
scheme := runtime.NewScheme()
err = clientgoscheme.AddToScheme(scheme)
if err != nil {
return &k8sClient{}, errors.Wrap(err, "failed to add scheme to")
}

err = mcfgv1.AddToScheme(scheme)
if err != nil {
return &k8sClient{}, errors.Wrap(err, "failed to add MCP scheme")
}
err = metal3v1alpha1.AddToScheme(scheme)
if err != nil {
return &k8sClient{}, errors.Wrap(err, "failed to add BMH scheme")
}

runtimeClient, err = runtimeclient.New(runtimeconfig.GetConfigOrDie(), runtimeclient.Options{Scheme: scheme})
if err != nil {
return &k8sClient{}, errors.Wrap(err, "failed to create runtime client")
}
err = machinev1beta1.AddToScheme(scheme)
if err != nil {
return &k8sClient{}, errors.Wrap(err, "failed to add Machine scheme")
}

err = mcfgv1.AddToScheme(scheme)
if err != nil {
return &k8sClient{}, errors.Wrap(err, "failed to add MCP scheme")
}

// Use the config we already loaded (works with both in-cluster and kubeconfig)
runtimeClient, err := runtimeclient.New(config, runtimeclient.Options{Scheme: scheme})
if err != nil {
return &k8sClient{}, errors.Wrap(err, "failed to create runtime client")
}

return &k8sClient{logger, client, ocClient, csvClient, runtimeClient, csrClient,
Expand Down Expand Up @@ -713,3 +714,14 @@ func (c *k8sClient) PatchMachineConfigPoolPaused(pause bool, mcpName string) err
c.log.Infof("Setting pause MCP %s to %t", mcpName, pause)
return c.runtimeClient.Patch(context.TODO(), mcp, runtimeclient.RawPatch(types.MergePatchType, pausePatch))
}

// GetMachineConfig fetches the cluster-scoped MachineConfig with the given name
// via the controller-runtime client.
func (c *k8sClient) GetMachineConfig(ctx context.Context, name string) (*mcfgv1.MachineConfig, error) {
	var mc mcfgv1.MachineConfig

	// MachineConfig is cluster-scoped, so the ObjectKey carries only a name.
	if err := c.runtimeClient.Get(ctx, types.NamespacedName{Name: name}, &mc); err != nil {
		return nil, fmt.Errorf("failed to get MachineConfig %s: %w", name, err)
	}

	return &mc, nil
}
Loading