Skip to content

Commit 52e2540

Browse files
Merge pull request #8376 from rwsu/AGENT-903
AGENT-903: monitor-add-nodes should only show CSRs matching node
2 parents 48241a0 + 5ad9955 commit 52e2540

File tree

11 files changed

+742
-146
lines changed

11 files changed

+742
-146
lines changed

cmd/node-joiner/main.go

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,22 @@ func main() {
3434
Use: "monitor-add-nodes",
3535
Short: "Monitors the configured nodes while they are joining an existing cluster",
3636
RunE: func(cmd *cobra.Command, args []string) error {
37-
return nodejoiner.NewMonitorAddNodesCommand("")
37+
dir, err := cmd.Flags().GetString("dir")
38+
if err != nil {
39+
return err
40+
}
41+
42+
kubeConfig, err := cmd.Flags().GetString("kubeconfig")
43+
if err != nil {
44+
return err
45+
}
46+
47+
ips := args
48+
logrus.Infof("Monitoring IPs: %v", ips)
49+
if len(ips) == 0 {
50+
logrus.Fatal("At least one IP address must be specified")
51+
}
52+
return nodejoiner.NewMonitorAddNodesCommand(dir, kubeConfig, ips)
3853
},
3954
}
4055

@@ -74,8 +89,9 @@ func runRootCmd(cmd *cobra.Command, args []string) {
7489
// Overriding it here allows the same check to be done, but against the
7590
// hook's output instead of the logger's output.
7691
ForceColors: terminal.IsTerminal(int(os.Stderr.Fd())),
77-
DisableTimestamp: true,
7892
DisableLevelTruncation: true,
93+
DisableTimestamp: false,
94+
FullTimestamp: true,
7995
DisableQuote: true,
8096
}))
8197

cmd/openshift-install/agent/waitfor.go

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,13 +2,15 @@ package agent
22

33
import (
44
"context"
5+
"path/filepath"
56

67
"github.com/pkg/errors"
78
"github.com/sirupsen/logrus"
89
"github.com/spf13/cobra"
910

1011
"github.com/openshift/installer/cmd/openshift-install/command"
1112
agentpkg "github.com/openshift/installer/pkg/agent"
13+
"github.com/openshift/installer/pkg/asset/agent/workflow"
1214
)
1315

1416
const (
@@ -62,8 +64,15 @@ func newWaitForBootstrapCompleteCmd() *cobra.Command {
6264
logrus.Fatal("No cluster installation directory found")
6365
}
6466

67+
kubeconfigPath := filepath.Join(assetDir, "auth", "kubeconfig")
68+
69+
rendezvousIP, sshKey, err := agentpkg.FindRendezvouIPAndSSHKeyFromAssetStore(assetDir)
70+
if err != nil {
71+
logrus.Fatal(err)
72+
}
73+
6574
ctx := context.Background()
66-
cluster, err := agentpkg.NewCluster(ctx, assetDir)
75+
cluster, err := agentpkg.NewCluster(ctx, assetDir, rendezvousIP, kubeconfigPath, sshKey, workflow.AgentWorkflowTypeInstall)
6776
if err != nil {
6877
logrus.Exit(exitCodeBootstrapFailed)
6978
}
@@ -90,8 +99,15 @@ func newWaitForInstallCompleteCmd() *cobra.Command {
9099
logrus.Fatal("No cluster installation directory found")
91100
}
92101

102+
kubeconfigPath := filepath.Join(assetDir, "auth", "kubeconfig")
103+
104+
rendezvousIP, sshKey, err := agentpkg.FindRendezvouIPAndSSHKeyFromAssetStore(assetDir)
105+
if err != nil {
106+
logrus.Fatal(err)
107+
}
108+
93109
ctx := context.Background()
94-
cluster, err := agentpkg.NewCluster(ctx, assetDir)
110+
cluster, err := agentpkg.NewCluster(ctx, assetDir, rendezvousIP, kubeconfigPath, sshKey, workflow.AgentWorkflowTypeInstall)
95111
if err != nil {
96112
logrus.Exit(exitCodeBootstrapFailed)
97113
}

pkg/agent/cluster.go

Lines changed: 119 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ package agent
22

33
import (
44
"context"
5+
"fmt"
56
"net"
67
"os"
78
"path/filepath"
@@ -14,6 +15,7 @@ import (
1415

1516
"github.com/openshift/assisted-service/client/installer"
1617
"github.com/openshift/assisted-service/models"
18+
"github.com/openshift/installer/pkg/asset/agent/workflow"
1719
"github.com/openshift/installer/pkg/gather/ssh"
1820
)
1921

@@ -27,6 +29,7 @@ type Cluster struct {
2729
clusterID *strfmt.UUID
2830
clusterInfraEnvID *strfmt.UUID
2931
installHistory *clusterInstallStatusHistory
32+
workflow workflow.AgentWorkflowType
3033
}
3134

3235
type clientSet struct {
@@ -63,21 +66,20 @@ type clusterInstallStatusHistory struct {
6366
}
6467

6568
// NewCluster initializes a Cluster object
66-
func NewCluster(ctx context.Context, assetDir string) (*Cluster, error) {
67-
69+
func NewCluster(ctx context.Context, assetDir, rendezvousIP, kubeconfigPath, sshKey string, workflowType workflow.AgentWorkflowType) (*Cluster, error) {
6870
czero := &Cluster{}
6971
capi := &clientSet{}
7072

71-
restclient, err := NewNodeZeroRestClient(ctx, assetDir)
73+
restclient, err := NewNodeZeroRestClient(ctx, rendezvousIP, sshKey)
7274
if err != nil {
7375
logrus.Fatal(err)
7476
}
75-
kubeclient, err := NewClusterKubeAPIClient(ctx, assetDir)
77+
kubeclient, err := NewClusterKubeAPIClient(ctx, kubeconfigPath)
7678
if err != nil {
7779
logrus.Fatal(err)
7880
}
7981

80-
ocpclient, err := NewClusterOpenShiftAPIClient(ctx, assetDir)
82+
ocpclient, err := NewClusterOpenShiftAPIClient(ctx, kubeconfigPath)
8183
if err != nil {
8284
logrus.Fatal(err)
8385
}
@@ -108,6 +110,7 @@ func NewCluster(ctx context.Context, assetDir string) (*Cluster, error) {
108110

109111
czero.Ctx = ctx
110112
czero.API = capi
113+
czero.workflow = workflowType
111114
czero.clusterID = nil
112115
czero.clusterInfraEnvID = nil
113116
czero.assetDir = assetDir
@@ -167,7 +170,6 @@ func (czero *Cluster) IsBootstrapComplete() (bool, bool, error) {
167170
if configmap {
168171
logrus.Info("Bootstrap configMap status is complete")
169172
czero.installHistory.ClusterBootstrapComplete = true
170-
return true, false, nil
171173
}
172174
if err != nil {
173175
logrus.Debug(err)
@@ -176,105 +178,133 @@ func (czero *Cluster) IsBootstrapComplete() (bool, bool, error) {
176178

177179
// Agent Rest API is available
178180
if agentRestAPILive {
179-
180-
// First time we see the agent Rest API
181-
if !czero.installHistory.RestAPISeen {
182-
logrus.Debug("Agent Rest API Initialized")
183-
czero.installHistory.RestAPISeen = true
184-
czero.installHistory.NotReadyTime = time.Now()
181+
exitOnErr, err := czero.MonitorStatusFromAssistedService()
182+
if err != nil {
183+
return false, exitOnErr, err
185184
}
185+
}
186186

187-
// Lazy loading of the clusterID and clusterInfraEnvID
188-
if czero.clusterID == nil {
189-
clusterID, err := czero.API.Rest.getClusterID()
190-
if err != nil {
191-
return false, false, errors.Wrap(err, "Unable to retrieve clusterID from Agent Rest API")
192-
}
193-
czero.clusterID = clusterID
194-
}
187+
// cluster bootstrap is not complete
188+
return false, false, nil
189+
}
195190

196-
if czero.clusterInfraEnvID == nil {
197-
clusterInfraEnvID, err := czero.API.Rest.getClusterInfraEnvID()
198-
if err != nil {
199-
return false, false, errors.Wrap(err, "Unable to retrieve clusterInfraEnvID from Agent Rest API")
200-
}
201-
czero.clusterInfraEnvID = clusterInfraEnvID
202-
}
191+
// MonitorStatusFromAssistedService (exit-on-error, returned-error)
192+
// checks if the Assisted Service API is up, and both cluster and
193+
// infraenv have been registered.
194+
//
195+
// After those preconditions are met,
196+
// it then reports on the host validation status and overall cluster
197+
// status and updates the cluster's install history.
198+
//
199+
// After cluster or host installation has started, new events from
200+
// the Assisted Service API are also logged and updated to the cluster's
201+
// install history.
202+
func (czero *Cluster) MonitorStatusFromAssistedService() (bool, error) {
203+
resource := "cluster"
204+
logPrefix := ""
205+
if czero.workflow == workflow.AgentWorkflowTypeAddNodes {
206+
resource = "host"
207+
logPrefix = fmt.Sprintf("Node %s: ", czero.API.Rest.NodeZeroIP)
208+
}
203209

204-
// Getting cluster metadata from Agent Rest API
205-
clusterMetadata, err := czero.GetClusterRestAPIMetadata()
210+
// First time we see the agent Rest API
211+
if !czero.installHistory.RestAPISeen {
212+
logrus.Debugf("%sAgent Rest API Initialized", logPrefix)
213+
czero.installHistory.RestAPISeen = true
214+
czero.installHistory.NotReadyTime = time.Now()
215+
}
216+
217+
// Lazy loading of the clusterID and clusterInfraEnvID
218+
if czero.clusterID == nil {
219+
clusterID, err := czero.API.Rest.getClusterID()
206220
if err != nil {
207-
return false, false, errors.Wrap(err, "Unable to retrieve cluster metadata from Agent Rest API")
221+
return false, errors.Wrap(err, "Unable to retrieve clusterID from Agent Rest API")
208222
}
223+
czero.clusterID = clusterID
224+
}
209225

210-
if clusterMetadata == nil {
211-
return false, false, errors.New("cluster metadata returned nil from Agent Rest API")
226+
if czero.clusterInfraEnvID == nil {
227+
clusterInfraEnvID, err := czero.API.Rest.getClusterInfraEnvID()
228+
if err != nil {
229+
return false, errors.Wrap(err, "Unable to retrieve clusterInfraEnvID from Agent Rest API")
212230
}
231+
czero.clusterInfraEnvID = clusterInfraEnvID
232+
}
233+
234+
// Getting cluster metadata from Agent Rest API
235+
clusterMetadata, err := czero.GetClusterRestAPIMetadata()
236+
if err != nil {
237+
return false, errors.Wrap(err, "Unable to retrieve cluster metadata from Agent Rest API")
238+
}
239+
240+
if clusterMetadata == nil {
241+
return false, errors.New("cluster metadata returned nil from Agent Rest API")
242+
}
213243

214-
czero.PrintInstallStatus(clusterMetadata)
244+
czero.PrintInstallStatus(clusterMetadata)
215245

216-
// If status indicates pending action, log host info to help pinpoint what is missing
217-
if (*clusterMetadata.Status != czero.installHistory.RestAPIPreviousClusterStatus) &&
218-
(*clusterMetadata.Status == models.ClusterStatusInstallingPendingUserAction) {
219-
for _, host := range clusterMetadata.Hosts {
220-
if *host.Status == models.ClusterStatusInstallingPendingUserAction {
246+
// If status indicates pending action, log host info to help pinpoint what is missing
247+
if (*clusterMetadata.Status != czero.installHistory.RestAPIPreviousClusterStatus) &&
248+
(*clusterMetadata.Status == models.ClusterStatusInstallingPendingUserAction) {
249+
for _, host := range clusterMetadata.Hosts {
250+
if *host.Status == models.ClusterStatusInstallingPendingUserAction {
251+
if logPrefix != "" {
252+
logrus.Warningf("%s%s %s", logPrefix, host.RequestedHostname, *host.StatusInfo)
253+
} else {
221254
logrus.Warningf("Host %s %s", host.RequestedHostname, *host.StatusInfo)
222255
}
223256
}
224257
}
258+
}
225259

226-
if *clusterMetadata.Status == models.ClusterStatusReady {
227-
stuck, err := czero.IsClusterStuckInReady()
228-
if err != nil {
229-
return false, stuck, err
230-
}
231-
} else {
232-
czero.installHistory.NotReadyTime = time.Now()
260+
if *clusterMetadata.Status == models.ClusterStatusReady {
261+
stuck, err := czero.IsClusterStuckInReady()
262+
if err != nil {
263+
return stuck, err
233264
}
265+
} else {
266+
czero.installHistory.NotReadyTime = time.Now()
267+
}
234268

235-
czero.installHistory.RestAPIPreviousClusterStatus = *clusterMetadata.Status
269+
czero.installHistory.RestAPIPreviousClusterStatus = *clusterMetadata.Status
236270

237-
installing, _ := czero.IsInstalling(*clusterMetadata.Status)
238-
if !installing {
239-
errored, _ := czero.HasErrored(*clusterMetadata.Status)
240-
if errored {
241-
return false, false, errors.New("cluster has stopped installing... working to recover installation")
242-
} else if *clusterMetadata.Status == models.ClusterStatusCancelled {
243-
return false, true, errors.New("cluster installation was cancelled")
244-
}
271+
installing, _ := czero.IsInstalling(*clusterMetadata.Status)
272+
if !installing {
273+
errored, _ := czero.HasErrored(*clusterMetadata.Status)
274+
if errored {
275+
return false, fmt.Errorf("%s has stopped installing... working to recover installation", resource)
276+
} else if *clusterMetadata.Status == models.ClusterStatusCancelled {
277+
return true, fmt.Errorf("%s installation was cancelled", resource)
245278
}
279+
}
246280

247-
validationsErr := checkValidations(clusterMetadata, czero.installHistory.ValidationResults, logrus.StandardLogger())
248-
if validationsErr != nil {
249-
return false, false, errors.Wrap(validationsErr, "cluster host validations failed")
281+
validationsErr := checkValidations(clusterMetadata, czero.installHistory.ValidationResults, logrus.StandardLogger(), logPrefix)
282+
if validationsErr != nil {
283+
return false, errors.Wrap(validationsErr, "host validations failed")
250284

251-
}
285+
}
252286

253-
// Print most recent event associated with the clusterInfraEnvID
254-
eventList, err := czero.API.Rest.GetInfraEnvEvents(czero.clusterInfraEnvID)
255-
if err != nil {
256-
return false, false, errors.Wrap(err, "Unable to retrieve events about the cluster from the Agent Rest API")
257-
}
258-
if len(eventList) == 0 {
259-
// No cluster events detected from the Agent Rest API
260-
} else {
261-
mostRecentEvent := eventList[len(eventList)-1]
262-
// Don't print the same status message back to back
263-
if *mostRecentEvent.Message != czero.installHistory.RestAPIPreviousEventMessage {
264-
if *mostRecentEvent.Severity == models.EventSeverityInfo {
265-
logrus.Info(*mostRecentEvent.Message)
266-
} else {
267-
logrus.Warn(*mostRecentEvent.Message)
268-
}
287+
// Print most recent event associated with the clusterInfraEnvID
288+
eventList, err := czero.API.Rest.GetInfraEnvEvents(czero.clusterInfraEnvID)
289+
if err != nil {
290+
return false, errors.Wrap(err, fmt.Sprintf("Unable to retrieve events about the %s from the Agent Rest API", resource))
291+
}
292+
if len(eventList) == 0 {
293+
// No cluster events detected from the Agent Rest API
294+
} else {
295+
mostRecentEvent := eventList[len(eventList)-1]
296+
// Don't print the same status message back to back
297+
if *mostRecentEvent.Message != czero.installHistory.RestAPIPreviousEventMessage {
298+
if *mostRecentEvent.Severity == models.EventSeverityInfo {
299+
logrus.Infof("%s%s", logPrefix, *mostRecentEvent.Message)
300+
} else {
301+
logrus.Warnf("%s%s", logPrefix, *mostRecentEvent.Message)
269302
}
270-
czero.installHistory.RestAPIPreviousEventMessage = *mostRecentEvent.Message
271-
czero.installHistory.RestAPIInfraEnvEventList = eventList
272303
}
273-
304+
czero.installHistory.RestAPIPreviousEventMessage = *mostRecentEvent.Message
305+
czero.installHistory.RestAPIInfraEnvEventList = eventList
274306
}
275-
276-
// cluster bootstrap is not complete
277-
return false, false, nil
307+
return false, nil
278308
}
279309

280310
// IsInstallComplete Determine if the cluster has completed installation.
@@ -429,15 +459,12 @@ func (czero *Cluster) PrintInstallationComplete() error {
429459
}
430460

431461
// PrintInstallStatus Print a human friendly message using the models from the Agent Rest API.
432-
func (czero *Cluster) PrintInstallStatus(cluster *models.Cluster) error {
433-
434-
friendlyStatus := humanFriendlyClusterInstallStatus(*cluster.Status)
462+
func (czero *Cluster) PrintInstallStatus(cluster *models.Cluster) {
463+
friendlyStatus := czero.humanFriendlyClusterInstallStatus(*cluster.Status)
435464
// Don't print the same status message back to back
436465
if *cluster.Status != czero.installHistory.RestAPIPreviousClusterStatus {
437466
logrus.Info(friendlyStatus)
438467
}
439-
440-
return nil
441468
}
442469

443470
// CanSSHToNodeZero Checks if ssh to NodeZero succeeds.
@@ -453,7 +480,7 @@ func (czero *Cluster) CanSSHToNodeZero() bool {
453480
}
454481

455482
// Human friendly install status strings mapped to the Agent Rest API cluster statuses
456-
func humanFriendlyClusterInstallStatus(status string) string {
483+
func (czero *Cluster) humanFriendlyClusterInstallStatus(status string) string {
457484
clusterStoppedInstallingStates := map[string]string{
458485
models.ClusterStatusAddingHosts: "Cluster is adding hosts",
459486
models.ClusterStatusCancelled: "Cluster installation cancelled",
@@ -466,6 +493,10 @@ func humanFriendlyClusterInstallStatus(status string) string {
466493
models.ClusterStatusPreparingForInstallation: "Preparing cluster for installation",
467494
models.ClusterStatusReady: "Cluster is ready for install",
468495
}
469-
return clusterStoppedInstallingStates[status]
470-
496+
switch czero.workflow {
497+
case workflow.AgentWorkflowTypeAddNodes:
498+
return fmt.Sprintf("Node %s: %s", czero.API.Rest.NodeZeroIP, clusterStoppedInstallingStates[status])
499+
default:
500+
return clusterStoppedInstallingStates[status]
501+
}
471502
}

0 commit comments

Comments
 (0)