
Commit 32b9a5b

[ci] capture logs of pods in the test namespace and kube-system (#7341) (#7512)

* fix: path for pod logs dump and uploading them as artifacts
* fix: derive k8s namespace from random uuid
* fix: dump the logs of all pods and the container statuses into an archive for the test namespace and kube-system
* fix: properly set the namespace for clusterrole and clusterrole bindings in kustomize-based k8s integration tests
* fix: extend k8sCheckAgentStatus to catch container restarts
* test: make k8s integration tests be flaky
* fix: time box all execs in pods
* Revert "test: make k8s integration tests be flaky" (reverts commit e776c64)

(cherry picked from commit 79bcfb7)

Co-authored-by: Panos Koutsovasilis <[email protected]>
1 parent: 224a485

File tree: 3 files changed (+172, -44 lines)

3 files changed

+172
-44
lines changed

.buildkite/bk.integration.pipeline.yml

Lines changed: 1 addition & 0 deletions
@@ -262,6 +262,7 @@ steps:
     artifact_paths:
       - build/**
       - build/diagnostics/**
+      - build/*.pod_logs_dump/*
     retry:
       automatic:
         limit: 1

.buildkite/scripts/buildkite-k8s-integration-tests.sh

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ EOF
 fully_qualified_group_name="${CLUSTER_NAME}_${TARGET_ARCH}_${variant}"
 outputXML="build/${fully_qualified_group_name}.integration.xml"
 outputJSON="build/${fully_qualified_group_name}.integration.out.json"
-pod_logs_base="build/${fully_qualified_group_name}.integration"
+pod_logs_base="${PWD}/build/${fully_qualified_group_name}.pod_logs_dump"

 set +e
 K8S_TESTS_POD_LOGS_BASE="${pod_logs_base}" AGENT_IMAGE="${image}" DOCKER_VARIANT="${variant}" gotestsum --hide-summary=skipped --format testname --no-color -f standard-quiet --junitfile "${outputXML}" --jsonfile "${outputJSON}" -- -tags kubernetes,integration -test.shuffle on -test.timeout 2h0m0s github.com/elastic/elastic-agent/testing/integration -v -args -integration.groups="${group_name}" -integration.sudo="false"

testing/integration/kubernetes_agent_standalone_test.go

Lines changed: 170 additions & 43 deletions
@@ -7,6 +7,7 @@
 package integration

 import (
+	"archive/tar"
 	"bufio"
 	"bytes"
 	"context"
@@ -24,6 +25,7 @@ import (
 	"testing"
 	"time"

+	"github.com/gofrs/uuid/v5"
 	"github.com/stretchr/testify/require"

 	"github.com/elastic/elastic-agent-libs/kibana"
@@ -791,8 +793,28 @@ func k8sCheckAgentStatus(ctx context.Context, client klient.Client, stdout *byte
 	namespace string, agentPodName string, containerName string, componentPresence map[string]bool,
 ) error {
 	command := []string{"elastic-agent", "status", "--output=json"}
+	stopCheck := errors.New("stop check")
+
+	// we will wait maximum 120 seconds for the agent to report healthy
+	ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
+	defer cancel()

 	checkStatus := func() error {
+		pod := corev1.Pod{}
+		if err := client.Resources(namespace).Get(ctx, agentPodName, namespace, &pod); err != nil {
+			return err
+		}
+
+		for _, container := range pod.Status.ContainerStatuses {
+			if container.Name != containerName {
+				continue
+			}
+
+			if restarts := container.RestartCount; restarts != 0 {
+				return fmt.Errorf("container %q of pod %q has restarted %d times: %w", containerName, agentPodName, restarts, stopCheck)
+			}
+		}
+
 		status := atesting.AgentStatusOutput{} // clear status output
 		stdout.Reset()
 		stderr.Reset()
@@ -823,16 +845,14 @@ func k8sCheckAgentStatus(ctx context.Context, client klient.Client, stdout *byte
 		}
 		return err
 	}
-
-	// we will wait maximum 120 seconds for the agent to report healthy
-	timeoutCtx, timeoutCancel := context.WithTimeout(ctx, 120*time.Second)
-	defer timeoutCancel()
 	for {
 		err := checkStatus()
 		if err == nil {
 			return nil
+		} else if errors.Is(err, stopCheck) {
+			return err
 		}
-		if timeoutCtx.Err() != nil {
+		if ctx.Err() != nil {
 			// timeout waiting for agent to become healthy
 			return errors.Join(err, errors.New("timeout waiting for agent to become healthy"))
 		}
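
Taken together, the two hunks above turn k8sCheckAgentStatus from a plain poll-until-timeout into a poll that can also bail out early: when a container restart is observed, the failure wraps a sentinel error, and errors.Is in the loop converts that into an immediate return instead of more retries. A minimal, self-contained sketch of the pattern; the names and the sleep interval are illustrative, not taken from the repository:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errStop is a sentinel: when a check wraps it with %w, the caller knows
// the condition is fatal and retrying is pointless.
var errStop = errors.New("stop check")

// waitHealthy polls check until it succeeds, the sentinel appears, or the
// context deadline expires, mirroring the loop in k8sCheckAgentStatus.
func waitHealthy(ctx context.Context, check func() error) error {
	ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
	defer cancel()
	for {
		err := check()
		if err == nil {
			return nil
		} else if errors.Is(err, errStop) {
			return err // fatal, e.g. a container restart was observed
		}
		if ctx.Err() != nil {
			return errors.Join(err, errors.New("timeout waiting for healthy state"))
		}
		time.Sleep(time.Second)
	}
}

func main() {
	attempts := 0
	err := waitHealthy(context.Background(), func() error {
		attempts++
		if attempts > 2 {
			return fmt.Errorf("restarted %d times: %w", attempts, errStop)
		}
		return errors.New("not healthy yet")
	})
	fmt.Println(err) // restarted 3 times: stop check
}
```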
@@ -848,7 +868,10 @@ func k8sGetAgentID(ctx context.Context, client klient.Client, stdout *bytes.Buff
 	status := atesting.AgentStatusOutput{} // clear status output
 	stdout.Reset()
 	stderr.Reset()
-	if err := client.Resources().ExecInPod(ctx, namespace, agentPodName, containerName, command, stdout, stderr); err != nil {
+	ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
+	err := client.Resources().ExecInPod(ctx, namespace, agentPodName, containerName, command, stdout, stderr)
+	cancel()
+	if err != nil {
 		return "", err
 	}

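This hunk, like the inner-tests and restrict-upgrade hunks further down, time-boxes a single ExecInPod call with its own two-minute context and calls cancel() right after the call rather than deferring it, which keeps timers from accumulating when the same idiom sits inside a loop. A rough sketch of the idiom, with a hypothetical execOnce standing in for ExecInPod:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// execOnce is a stand-in for client.Resources().ExecInPod: any call that can
// block indefinitely and honors context cancellation.
func execOnce(ctx context.Context) error {
	select {
	case <-time.After(5 * time.Second): // pretend the command hangs
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	for i := 0; i < 3; i++ {
		// Per-iteration timeout; cancel immediately after the call instead
		// of defer, so each iteration releases its timer right away.
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		err := execOnce(ctx)
		cancel()
		fmt.Printf("attempt %d: %v\n", i, err) // context deadline exceeded
	}
}
```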
@@ -869,58 +892,157 @@ func getAgentComponentState(status atesting.AgentStatusOutput, componentName str
 	return -1, false
 }

-// k8sDumpAllPodLogs dumps the logs of all pods in the given namespace to the given target directory
-func k8sDumpAllPodLogs(ctx context.Context, client klient.Client, testName string, namespace string, targetDir string) error {
-	podList := &corev1.PodList{}
+// k8sDumpPods creates an archive in the given target directory that contains the logs of all pods in the given namespace and kube-system
+func k8sDumpPods(t *testing.T, ctx context.Context, client klient.Client, testName string, namespace string, targetDir string, testStartTime time.Time) {
+	// Create the tar file
+	archivePath := filepath.Join(targetDir, fmt.Sprintf("%s.tar.gz", namespace))
+	tarFile, err := os.Create(archivePath)
+	if err != nil {
+		t.Logf("failed to create archive at path %q", archivePath)
+		return
+	}
+	defer tarFile.Close()
+
+	t.Logf("archive %q contains the dump info for %q test", archivePath, testName)
+
+	// Create a new tar writer
+	tarWriter := tar.NewWriter(tarFile)
+	defer tarWriter.Close()

 	clientSet, err := kubernetes.NewForConfig(client.RESTConfig())
 	if err != nil {
-		return fmt.Errorf("error creating clientset: %w", err)
+		t.Logf("error creating clientset: %s", err)
+		return
 	}

-	err = client.Resources(namespace).List(ctx, podList)
+	podList := &corev1.PodList{}
+	err = client.Resources("").List(ctx, podList)
 	if err != nil {
-		return fmt.Errorf("error listing pods: %w", err)
+		t.Logf("error listing pods: %s", err)
+		return
 	}

-	var errs error
+	type containerPodState struct {
+		Namespace              string `json:"namespace"`
+		PodName                string `json:"pod_name"`
+		ContainerName          string `json:"container_name"`
+		RestartCount           int32  `json:"restart_count"`
+		LastTerminationReason  string `json:"last_termination_reason"`
+		LastTerminationMessage string `json:"last_termination_message"`
+	}
+
+	var statesDump []containerPodState
+
 	for _, pod := range podList.Items {
-		previous := false
-		for _, containerStatus := range pod.Status.ContainerStatuses {
-			if containerStatus.RestartCount > 0 {
-				previous = true
-				break
-			}
+		podNamespace := pod.GetNamespace()
+		if podNamespace != namespace && podNamespace != "kube-system" {
+			continue
 		}

 		for _, container := range pod.Spec.Containers {
-			logFilePath := filepath.Join(targetDir, fmt.Sprintf("%s-%s-%s.log", testName, pod.Name, container.Name))
-			logFile, err := os.Create(logFilePath)
-			if err != nil {
-				errs = errors.Join(fmt.Errorf("error creating log file: %w", err), errs)
-				continue
+			state := containerPodState{
+				Namespace:     podNamespace,
+				PodName:       pod.GetName(),
+				ContainerName: container.Name,
 			}
+			previous := false

-			req := clientSet.CoreV1().Pods(namespace).GetLogs(pod.Name, &corev1.PodLogOptions{
+			for _, containerStatus := range pod.Status.ContainerStatuses {
+				if container.Name != containerStatus.Name {
+					continue
+				}
+
+				state.RestartCount = containerStatus.RestartCount
+				if containerStatus.RestartCount == 0 {
+					break
+				}
+				// since we dump logs from pods that are expected to run constantly,
+				// namely kube-apiserver in the kube-system namespace, we need to identify
+				// whether such a pod restarted during the test to correctly decide if we
+				// want the previous log
+				containerTerminated := containerStatus.LastTerminationState.Terminated
+				if containerTerminated != nil && containerTerminated.FinishedAt.After(testStartTime) {
+					previous = true
+					state.LastTerminationReason = containerTerminated.Reason
+					state.LastTerminationMessage = containerTerminated.Message
+				}
+				break
+			}
+			statesDump = append(statesDump, state)
+
+			var logFileName string
+			if previous {
+				logFileName = fmt.Sprintf("%s-%s-%s-previous.log", podNamespace, pod.Name, container.Name)
+			} else {
+				logFileName = fmt.Sprintf("%s-%s-%s.log", podNamespace, pod.Name, container.Name)
+			}
+
+			req := clientSet.CoreV1().Pods(podNamespace).GetLogs(pod.Name, &corev1.PodLogOptions{
 				Container: container.Name,
 				Previous:  previous,
+				SinceTime: &metav1.Time{Time: testStartTime},
 			})
-			podLogsStream, err := req.Stream(context.TODO())
+
+			streamCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
+			podLogsStream, err := req.Stream(streamCtx)
 			if err != nil {
-				errs = errors.Join(fmt.Errorf("error getting container %s of pod %s logs: %w", container.Name, pod.Name, err), errs)
+				cancel()
+				t.Logf("error getting container %q of pod %q logs: %s", container.Name, pod.Name, err)
 				continue
 			}

-			_, err = io.Copy(logFile, podLogsStream)
+			b, err := io.ReadAll(podLogsStream)
+			_ = podLogsStream.Close()
+			cancel()
 			if err != nil {
-				errs = errors.Join(fmt.Errorf("error writing container %s of pod %s logs: %w", container.Name, pod.Name, err), errs)
+				t.Logf("error reading container %q logs of pod %q: %s", container.Name, pod.Name, err)
+				continue
 			}

-			_ = podLogsStream.Close()
+			header := &tar.Header{
+				Name:       logFileName,
+				Size:       int64(len(b)),
+				Mode:       0600,
+				ModTime:    time.Now(),
+				AccessTime: time.Now(),
+				ChangeTime: time.Now(),
+			}
+
+			if err := tarWriter.WriteHeader(header); err != nil {
+				t.Logf("error writing header of file %q in archive: %s", logFileName, err)
+				continue
+			}
+
+			if _, err := tarWriter.Write(b); err != nil {
+				t.Logf("error writing data of file %q in archive: %s", logFileName, err)
+			}
 		}
 	}

-	return errs
+	b, err := json.Marshal(statesDump)
+	if err != nil {
+		t.Logf("error marshalling pod states: %s", err)
+		return
+	}
+
+	statesDumpFile := "containerPodsStates.json"
+	header := &tar.Header{
+		Name:       statesDumpFile,
+		Size:       int64(len(b)),
+		Mode:       0600,
+		ModTime:    time.Now(),
+		AccessTime: time.Now(),
+		ChangeTime: time.Now(),
+	}
+
+	if err := tarWriter.WriteHeader(header); err != nil {
+		t.Logf("error writing header of file %q in archive: %s", statesDumpFile, err)
+		return
+	}
+
+	if _, err := tarWriter.Write(b); err != nil {
+		t.Logf("error writing data of file %q in archive: %s", statesDumpFile, err)
+	}
 }

 // k8sKustomizeAdjustObjects adjusts the namespace of given k8s objects and calls the given callbacks for the containers and the pod
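
The mechanism behind k8sDumpPods is plain archive/tar: read each log fully into memory, write a tar.Header whose Size matches the byte slice, then write the bytes; the same dance is repeated once more for containerPodsStates.json. A standalone sketch of that mechanism follows; note the gzip layer here is an added assumption to match the .tar.gz suffix, while the diff itself writes the tar stream straight to the file:

```go
package main

import (
	"archive/tar"
	"compress/gzip"
	"log"
	"os"
	"time"
)

// addFile writes one in-memory payload into the archive, mirroring how
// k8sDumpPods stores each pod log and the containerPodsStates.json dump.
func addFile(tw *tar.Writer, name string, b []byte) error {
	hdr := &tar.Header{
		Name:    name,
		Size:    int64(len(b)), // Size must match exactly what is written
		Mode:    0600,
		ModTime: time.Now(),
	}
	if err := tw.WriteHeader(hdr); err != nil {
		return err
	}
	_, err := tw.Write(b)
	return err
}

func main() {
	f, err := os.Create("dump.tar.gz")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// gzip layer (assumed) so the .tar.gz suffix is honest; close order
	// matters: tar first, then gzip, then the file (defers run LIFO).
	gw := gzip.NewWriter(f)
	defer gw.Close()
	tw := tar.NewWriter(gw)
	defer tw.Close()

	if err := addFile(tw, "kube-system-pod-container.log", []byte("log line\n")); err != nil {
		log.Fatal(err)
	}
}
```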
@@ -1142,12 +1264,10 @@ func k8sCreateObjects(ctx context.Context, client klient.Client, opts k8sCreateO
 			for idx := range objWithType.Subjects {
 				objWithType.Subjects[idx].Namespace = opts.namespace
 			}
-			continue
 		case *rbacv1.RoleBinding:
 			for idx := range objWithType.Subjects {
 				objWithType.Subjects[idx].Namespace = opts.namespace
 			}
-			continue
 		}
 	}
 	if err := client.Resources().Create(ctx, obj); err != nil {
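
The two deleted continue statements are the substance of the binding-namespace fix: in Go, a continue inside a switch case does not leave the switch (that is what break does), it jumps to the next iteration of the enclosing for, so these binding objects had their subjects patched but never reached the client.Resources().Create call below. A tiny sketch of that control flow:

```go
package main

import "fmt"

func main() {
	objects := []string{"role-binding", "config-map", "cluster-role-binding"}
	for _, obj := range objects {
		switch obj {
		case "role-binding", "cluster-role-binding":
			fmt.Println("patch subjects namespace for", obj)
			// A `continue` here would jump straight to the next loop
			// iteration: inside a switch, `continue` still targets the
			// enclosing for, so the create step below would be skipped.
		}
		fmt.Println("create", obj) // now reached for every object
	}
}
```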
@@ -1260,12 +1380,18 @@ type k8sContext struct {
 	esAPIKey string
 	// enrollParams contains the information needed to enroll an agent with Fleet in the test
 	enrollParams *fleettools.EnrollParams
+	// createdAt is the time when the k8sContext was created
+	createdAt time.Time
 }

 // getNamespace returns a unique namespace for the current test
-func (k8sContext) getNamespace(t *testing.T) string {
+func (k k8sContext) getNamespace(t *testing.T) string {
+	nsUUID, err := uuid.NewV4()
+	if err != nil {
+		t.Fatalf("error generating namespace UUID: %v", err)
+	}
 	hasher := sha256.New()
-	hasher.Write([]byte(t.Name()))
+	hasher.Write([]byte(nsUUID.String()))
 	testNamespace := strings.ToLower(base64.URLEncoding.EncodeToString(hasher.Sum(nil)))
 	return noSpecialCharsRegexp.ReplaceAllString(testNamespace, "")
 }
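
getNamespace previously hashed t.Name(), so a re-run of the same test landed in the same namespace; hashing a fresh UUIDv4 makes the namespace unique per run, while the sha256/base64/strip chain keeps it a valid DNS-style name. The chain is easy to check in isolation; the regexp below is a stand-in for the package's noSpecialCharsRegexp:

```go
package main

import (
	"crypto/sha256"
	"encoding/base64"
	"fmt"
	"regexp"
	"strings"

	"github.com/gofrs/uuid/v5"
)

// stand-in for the package-level noSpecialCharsRegexp
var noSpecialChars = regexp.MustCompile(`[^a-z0-9]`)

func randomNamespace() (string, error) {
	nsUUID, err := uuid.NewV4()
	if err != nil {
		return "", err
	}
	// sha256 -> base64url -> lowercase -> strip non [a-z0-9], leaving a
	// string safe to use as a Kubernetes namespace name
	hasher := sha256.New()
	hasher.Write([]byte(nsUUID.String()))
	ns := strings.ToLower(base64.URLEncoding.EncodeToString(hasher.Sum(nil)))
	return noSpecialChars.ReplaceAllString(ns, ""), nil
}

func main() {
	ns, err := randomNamespace()
	if err != nil {
		panic(err)
	}
	fmt.Println(ns) // unique on every run
}
```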
@@ -1323,7 +1449,7 @@ func k8sGetContext(t *testing.T, info *define.Info) k8sContext {
 	testLogsBasePath := os.Getenv("K8S_TESTS_POD_LOGS_BASE")
 	require.NotEmpty(t, testLogsBasePath, "K8S_TESTS_POD_LOGS_BASE must be set")

-	err = os.MkdirAll(filepath.Join(testLogsBasePath, t.Name()), 0o755)
+	err = os.MkdirAll(testLogsBasePath, 0o755)
 	require.NoError(t, err, "failed to create test logs directory")

 	esHost := os.Getenv("ELASTICSEARCH_HOST")
@@ -1346,6 +1472,7 @@ func k8sGetContext(t *testing.T, info *define.Info) k8sContext {
 		esHost:       esHost,
 		esAPIKey:     esAPIKey,
 		enrollParams: enrollParams,
+		createdAt:    time.Now(),
 	}
 }

@@ -1470,9 +1597,7 @@ func k8sStepDeployKustomize(kustomizePath string, containerName string, override

 	t.Cleanup(func() {
 		if t.Failed() {
-			if err := k8sDumpAllPodLogs(ctx, kCtx.client, namespace, namespace, kCtx.logsBasePath); err != nil {
-				t.Logf("failed to dump logs: %v", err)
-			}
+			k8sDumpPods(t, ctx, kCtx.client, t.Name(), namespace, kCtx.logsBasePath, kCtx.createdAt)
 		}

 		err := k8sDeleteObjects(ctx, kCtx.client, k8sDeleteOpts{wait: true}, objects...)
@@ -1481,7 +1606,7 @@ func k8sStepDeployKustomize(kustomizePath string, containerName string, override
 		}
 	})

-	err = k8sCreateObjects(ctx, kCtx.client, k8sCreateOpts{wait: true}, objects...)
+	err = k8sCreateObjects(ctx, kCtx.client, k8sCreateOpts{wait: true, namespace: namespace}, objects...)
 	require.NoError(t, err, "failed to create objects")
 }
@@ -1544,8 +1669,10 @@ func k8sStepRunInnerTests(agentPodLabelSelector string, expectedPodNumber int, c

 	for _, pod := range perNodePodList.Items {
 		var stdout, stderr bytes.Buffer
+		ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
 		err = kCtx.client.Resources().ExecInPod(ctx, namespace, pod.Name, containerName,
 			[]string{"/usr/share/elastic-agent/k8s-inner-tests", "-test.v"}, &stdout, &stderr)
+		cancel()
 		t.Logf("%s k8s-inner-tests output:", pod.Name)
 		t.Log(stdout.String())
 		if err != nil {
@@ -1590,9 +1717,7 @@ func k8sStepHelmDeploy(chartPath string, releaseName string, values map[string]a

 	t.Cleanup(func() {
 		if t.Failed() {
-			if err := k8sDumpAllPodLogs(ctx, kCtx.client, namespace, namespace, kCtx.logsBasePath); err != nil {
-				t.Logf("failed to dump logs: %v", err)
-			}
+			k8sDumpPods(t, ctx, kCtx.client, t.Name(), namespace, kCtx.logsBasePath, kCtx.createdAt)
 		}

 		uninstallAction := action.NewUninstall(actionConfig)
@@ -1709,7 +1834,9 @@ func k8sStepCheckRestrictUpgrade(agentPodLabelSelector string, expectedPodNumber
 		var stdout, stderr bytes.Buffer

 		command := []string{"elastic-agent", "upgrade", "1.0.0"}
+		ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
 		err := kCtx.client.Resources().ExecInPod(ctx, namespace, pod.Name, containerName, command, &stdout, &stderr)
+		cancel()
 		require.Error(t, err)
 		require.Contains(t, stderr.String(), coordinator.ErrNotUpgradable.Error())
 	}
