Skip to content

Commit 3b3d8d9

Browse files
authored
test: improve e2e logs (#7469)
1 parent 0d980db commit 3b3d8d9

File tree

6 files changed

+37
-36
lines changed

6 files changed

+37
-36
lines changed

e2e/cluster.go

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -279,9 +279,15 @@ func getExistingCluster(ctx context.Context, location, clusterName string) (*arm
279279
fallthrough
280280
case "Failed":
281281
logf(ctx, "echo \"##vso[task.logissue type=warning;]Cluster %s in Failed state\"", clusterName)
282-
derr := deleteCluster(ctx, clusterName, resourceGroupName)
283-
if derr != nil {
284-
return nil, derr
282+
if err := deleteCluster(ctx, clusterName, resourceGroupName); err != nil {
283+
return nil, err
284+
}
285+
// Wait for Azure to confirm cluster is fully deleted before allowing recreation.
286+
// This prevents "Reconcile managed identity credential failed" errors where Azure's
287+
// backend still has stale references to the old cluster during the new cluster's
288+
// identity reconciliation process.
289+
if err := waitForClusterDeletion(ctx, clusterName, resourceGroupName); err != nil {
290+
return nil, fmt.Errorf("failed waiting for cluster deletion: %w", err)
285291
}
286292
return nil, nil
287293
default:
@@ -316,6 +322,20 @@ func deleteCluster(ctx context.Context, clusterName, resourceGroupName string) e
316322
return nil
317323
}
318324

325+
func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName string) error {
326+
return wait.PollUntilContextCancel(ctx, 5*time.Second, true, func(ctx context.Context) (bool, error) {
327+
_, err := config.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil)
328+
if err != nil {
329+
var azErr *azcore.ResponseError
330+
if errors.As(err, &azErr) && azErr.StatusCode == 404 {
331+
return true, nil // Cluster is gone
332+
}
333+
return false, fmt.Errorf("unexpected error checking cluster: %w", err)
334+
}
335+
return false, nil // Still exists, keep polling
336+
})
337+
}
338+
319339
func waitUntilClusterReady(ctx context.Context, name, location string) (*armcontainerservice.ManagedCluster, error) {
320340
var cluster armcontainerservice.ManagedClustersClientGetResponse
321341
err := wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
@@ -349,7 +369,7 @@ func isExistingResourceGroup(ctx context.Context, resourceGroupName string) (boo
349369
}
350370

351371
func createNewAKSCluster(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) {
352-
logf(ctx, "creating or updating cluster %s in rg %s", *cluster.Name, *cluster.Location)
372+
logf(ctx, "creating or updating cluster %s in location %s", *cluster.Name, *cluster.Location)
353373
// Note, it seems like the operation still can start a trigger a new operation even if nothing has changes
354374
pollerResp, err := config.Azure.AKS.BeginCreateOrUpdate(
355375
ctx,

e2e/config/azure.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,7 @@ func (a *AzureClient) ensureReplication(ctx context.Context, image *Image, versi
550550
err := a.replicateImageVersionToCurrentRegion(ctx, image, version, location)
551551
elapsed := time.Since(start) // Calculate the elapsed time
552552

553-
toolkit.LogDuration(ctx, elapsed, 3*time.Minute, fmt.Sprintf("Replication took: %s (%s)", toolkit.FormatDuration(elapsed), *version.ID))
553+
toolkit.LogDuration(ctx, elapsed, 3*time.Minute, fmt.Sprintf("Replication took: %s (%s)", elapsed, *version.ID))
554554

555555
return err
556556
}
@@ -569,6 +569,7 @@ func (a *AzureClient) waitForVersionOperationCompletion(ctx context.Context, ima
569569
}
570570

571571
// Use the standard wait.PollUntilContextTimeout helper used throughout the codebase
572+
var lastLoggedState armcompute.GalleryProvisioningState
572573
err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
573574
// Get the latest version state using the existing client
574575
resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, *version.Name, nil)
@@ -578,7 +579,11 @@ func (a *AzureClient) waitForVersionOperationCompletion(ctx context.Context, ima
578579
}
579580

580581
currentState := *resp.Properties.ProvisioningState
581-
logf(ctx, "Image version %s current state: %s", *version.ID, currentState)
582+
// Only log if state has changed
583+
if currentState != lastLoggedState {
584+
logf(ctx, "Image version %s current state: %s", *version.ID, currentState)
585+
lastLoggedState = currentState
586+
}
582587

583588
// Check if operation completed
584589
if currentState != armcompute.GalleryProvisioningStateUpdating {

e2e/kube.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,6 @@ func getClusterKubeClient(ctx context.Context, resourceGroupName, clusterName st
7373
}
7474

7575
func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) {
76-
logf(ctx, "waiting for pod %s %s in %q namespace to be ready", labelSelector, fieldSelector, namespace)
77-
7876
var pod *corev1.Pod
7977

8078
err := wait.PollUntilContextTimeout(ctx, time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) {
@@ -122,7 +120,6 @@ func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string,
122120
// Check if the pod is ready
123121
for _, cond := range pod.Status.Conditions {
124122
if cond.Type == "Ready" && cond.Status == "True" {
125-
logf(ctx, "pod %s is ready", pod.Name)
126123
return true, nil
127124
}
128125
}

e2e/test_helpers.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ func prepareAKSNode(ctx context.Context, s *Scenario) *ScenarioVM {
272272
scenarioVM.KubeName = s.Runtime.Cluster.Kube.WaitUntilNodeReady(ctx, s.T, s.Runtime.VMSSName)
273273
readyElapse := time.Since(vmssCreatedAt) // Calculate the elapsed time
274274
totalElapse := time.Since(start)
275-
toolkit.LogDuration(ctx, totalElapse, 3*time.Minute, fmt.Sprintf("Node %s took %s to be created and %s to be ready", s.Runtime.VMSSName, toolkit.FormatDuration(creationElapse), toolkit.FormatDuration(readyElapse)))
275+
toolkit.LogDuration(ctx, totalElapse, 3*time.Minute, fmt.Sprintf("Node %s took %s to be created and %s to be ready", s.Runtime.VMSSName, creationElapse, readyElapse))
276276
}
277277

278278
return scenarioVM
@@ -517,7 +517,7 @@ func RunCommand(ctx context.Context, s *Scenario, command string) (armcompute.Ru
517517
start := time.Now()
518518
defer func() {
519519
elapsed := time.Since(start)
520-
logf(ctx, "Command %q took %s", command, toolkit.FormatDuration(elapsed))
520+
logf(ctx, "Command %q took %s", command, elapsed)
521521
}()
522522

523523
runPoller, err := config.Azure.VMSSVM.BeginRunCommand(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, armcompute.RunCommandInput{
@@ -577,7 +577,7 @@ func CreateImage(ctx context.Context, s *Scenario) *config.Image {
577577
func CreateSIGImageVersionFromDisk(ctx context.Context, s *Scenario, version string, diskResourceID string) *config.Image {
578578
startTime := time.Now()
579579
defer func() {
580-
s.T.Logf("Created SIG image version %s from disk %s in %s", version, diskResourceID, toolkit.FormatDuration(time.Since(startTime)))
580+
s.T.Logf("Created SIG image version %s from disk %s in %s", version, diskResourceID, time.Since(startTime))
581581
}()
582582
rg := config.ResourceGroupName(s.Location)
583583
gallery, err := CachedCreateGallery(ctx, CreateGalleryRequest{
@@ -683,7 +683,7 @@ func validateSSHConnectivity(ctx context.Context, s *Scenario) error {
683683
err := attemptSSHConnection(ctx, s)
684684
if err == nil {
685685
elapsed := time.Since(startTime)
686-
s.T.Logf("SSH connectivity established after %s", toolkit.FormatDuration(elapsed))
686+
s.T.Logf("SSH connectivity established after %s", elapsed)
687687
return true, nil
688688
}
689689

@@ -712,7 +712,7 @@ func validateSSHConnectivity(ctx context.Context, s *Scenario) error {
712712
// If we timed out while retrying reboot-related errors, provide a better error message
713713
if err != nil && lastSSHError != nil {
714714
elapsed := time.Since(startTime)
715-
return fmt.Errorf("SSH connection failed after waiting %s for node to reboot and come back up. Last SSH error: %w", toolkit.FormatDuration(elapsed), lastSSHError)
715+
return fmt.Errorf("SSH connection failed after waiting %s for node to reboot and come back up. Last SSH error: %w", elapsed, lastSSHError)
716716
}
717717

718718
return err

e2e/toolkit/strings.go

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,9 @@ package toolkit
33
import (
44
"context"
55
"strconv"
6-
"strings"
76
"time"
8-
9-
"k8s.io/apimachinery/pkg/util/duration"
107
)
118

12-
func StrToBoolMap(str string) map[string]bool {
13-
str = strings.ReplaceAll(str, " ", "")
14-
if str == "" {
15-
return nil
16-
}
17-
parts := strings.SplitN(str, ",", -1)
18-
m := make(map[string]bool, len(parts))
19-
for _, p := range parts {
20-
m[p] = true
21-
}
22-
return m
23-
}
24-
259
func StrToInt32(s string) int32 {
2610
i, err := strconv.ParseInt(s, 10, 32)
2711
if err != nil {
@@ -32,12 +16,8 @@ func StrToInt32(s string) int32 {
3216

3317
func LogDuration(ctx context.Context, duration time.Duration, warningDuration time.Duration, message string) {
3418
if duration > warningDuration {
35-
Logf(ctx, "##vso[task.logissue type=warning;] %s", message)
19+
Logf(ctx, "⚠️ ##vso[task.logissue type=warning;] %s", message)
3620
} else {
3721
Log(ctx, message)
3822
}
3923
}
40-
41-
func FormatDuration(length time.Duration) string {
42-
return duration.ShortHumanDuration(length)
43-
}

e2e/vmss.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -639,7 +639,6 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) {
639639
s.T.Logf("##vso[task.logissue type=warning;]Storage account %s (%s) in Azure portal: %s", config.Config.BlobStorageAccount(), blobPrefix, azurePortalURL)
640640

641641
runCommandTimeout := int32((20 * time.Minute).Seconds())
642-
s.T.Logf("run command timeout: %d", runCommandTimeout)
643642

644643
pollerResp, err := client.BeginCreateOrUpdate(
645644
ctx,

0 commit comments

Comments
 (0)