77package integration
88
99import (
10+ "archive/tar"
1011 "bufio"
1112 "bytes"
1213 "context"
@@ -24,6 +25,7 @@ import (
2425 "testing"
2526 "time"
2627
28+ "github.com/gofrs/uuid/v5"
2729 "github.com/stretchr/testify/require"
2830
2931 "github.com/elastic/elastic-agent-libs/kibana"
@@ -791,8 +793,28 @@ func k8sCheckAgentStatus(ctx context.Context, client klient.Client, stdout *byte
791793 namespace string , agentPodName string , containerName string , componentPresence map [string ]bool ,
792794) error {
793795 command := []string {"elastic-agent" , "status" , "--output=json" }
796+ stopCheck := errors .New ("stop check" )
797+
798+ // we will wait maximum 120 seconds for the agent to report healthy
799+ ctx , cancel := context .WithTimeout (ctx , 2 * time .Minute )
800+ defer cancel ()
794801
795802 checkStatus := func () error {
803+ pod := corev1.Pod {}
804+ if err := client .Resources (namespace ).Get (ctx , agentPodName , namespace , & pod ); err != nil {
805+ return err
806+ }
807+
808+ for _ , container := range pod .Status .ContainerStatuses {
809+ if container .Name != containerName {
810+ continue
811+ }
812+
813+ if restarts := container .RestartCount ; restarts != 0 {
814+ return fmt .Errorf ("container %q of pod %q has restarted %d times: %w" , containerName , agentPodName , restarts , stopCheck )
815+ }
816+ }
817+
796818 status := atesting.AgentStatusOutput {} // clear status output
797819 stdout .Reset ()
798820 stderr .Reset ()
@@ -823,16 +845,14 @@ func k8sCheckAgentStatus(ctx context.Context, client klient.Client, stdout *byte
823845 }
824846 return err
825847 }
826-
827- // we will wait maximum 120 seconds for the agent to report healthy
828- timeoutCtx , timeoutCancel := context .WithTimeout (ctx , 120 * time .Second )
829- defer timeoutCancel ()
830848 for {
831849 err := checkStatus ()
832850 if err == nil {
833851 return nil
852+ } else if errors .Is (err , stopCheck ) {
853+ return err
834854 }
835- if timeoutCtx .Err () != nil {
855+ if ctx .Err () != nil {
836856 // timeout waiting for agent to become healthy
837857 return errors .Join (err , errors .New ("timeout waiting for agent to become healthy" ))
838858 }
@@ -848,7 +868,10 @@ func k8sGetAgentID(ctx context.Context, client klient.Client, stdout *bytes.Buff
848868 status := atesting.AgentStatusOutput {} // clear status output
849869 stdout .Reset ()
850870 stderr .Reset ()
851- if err := client .Resources ().ExecInPod (ctx , namespace , agentPodName , containerName , command , stdout , stderr ); err != nil {
871+ ctx , cancel := context .WithTimeout (ctx , 2 * time .Minute )
872+ err := client .Resources ().ExecInPod (ctx , namespace , agentPodName , containerName , command , stdout , stderr )
873+ cancel ()
874+ if err != nil {
852875 return "" , err
853876 }
854877
@@ -869,58 +892,157 @@ func getAgentComponentState(status atesting.AgentStatusOutput, componentName str
869892 return - 1 , false
870893}
871894
872- // k8sDumpAllPodLogs dumps the logs of all pods in the given namespace to the given target directory
873- func k8sDumpAllPodLogs (ctx context.Context , client klient.Client , testName string , namespace string , targetDir string ) error {
874- podList := & corev1.PodList {}
895+ // k8sDumpPods creates an archive that contains logs of all pods in the given namespace and kube-system to the given target directory
896+ func k8sDumpPods (t * testing.T , ctx context.Context , client klient.Client , testName string , namespace string , targetDir string , testStartTime time.Time ) {
897+ // Create the tar file
898+ archivePath := filepath .Join (targetDir , fmt .Sprintf ("%s.tar.gz" , namespace ))
899+ tarFile , err := os .Create (archivePath )
900+ if err != nil {
901+ t .Logf ("failed to create archive at path %q" , archivePath )
902+ return
903+ }
904+ defer tarFile .Close ()
905+
906+ t .Logf ("archive %q contains the dump info for %q test" , archivePath , testName )
907+
908+ // Create a new tar writer
909+ tarWriter := tar .NewWriter (tarFile )
910+ defer tarWriter .Close ()
875911
876912 clientSet , err := kubernetes .NewForConfig (client .RESTConfig ())
877913 if err != nil {
878- return fmt .Errorf ("error creating clientset: %w" , err )
914+ t .Logf ("error creating clientset: %s" , err )
915+ return
879916 }
880917
881- err = client .Resources (namespace ).List (ctx , podList )
918+ podList := & corev1.PodList {}
919+ err = client .Resources ("" ).List (ctx , podList )
882920 if err != nil {
883- return fmt .Errorf ("error listing pods: %w" , err )
921+ t .Logf ("error listing pods: %s" , err )
922+ return
884923 }
885924
886- var errs error
925+ type containerPodState struct {
926+ Namespace string `json:"namespace"`
927+ PodName string `json:"pod_name"`
928+ ContainerName string `json:"container_name"`
929+ RestartCount int32 `json:"restart_count"`
930+ LastTerminationReason string `json:"last_termination_reason"`
931+ LastTerminationMessage string `json:"last_termination_message"`
932+ }
933+
934+ var statesDump []containerPodState
935+
887936 for _ , pod := range podList .Items {
888- previous := false
889- for _ , containerStatus := range pod .Status .ContainerStatuses {
890- if containerStatus .RestartCount > 0 {
891- previous = true
892- break
893- }
937+ podNamespace := pod .GetNamespace ()
938+ if podNamespace != namespace && podNamespace != "kube-system" {
939+ continue
894940 }
895941
896942 for _ , container := range pod .Spec .Containers {
897- logFilePath := filepath .Join (targetDir , fmt .Sprintf ("%s-%s-%s.log" , testName , pod .Name , container .Name ))
898- logFile , err := os .Create (logFilePath )
899- if err != nil {
900- errs = errors .Join (fmt .Errorf ("error creating log file: %w" , err ), errs )
901- continue
943+ state := containerPodState {
944+ Namespace : podNamespace ,
945+ PodName : pod .GetName (),
946+ ContainerName : container .Name ,
902947 }
948+ previous := false
903949
904- req := clientSet .CoreV1 ().Pods (namespace ).GetLogs (pod .Name , & corev1.PodLogOptions {
950+ for _ , containerStatus := range pod .Status .ContainerStatuses {
951+ if container .Name != containerStatus .Name {
952+ continue
953+ }
954+
955+ state .RestartCount = containerStatus .RestartCount
956+ if containerStatus .RestartCount == 0 {
957+ break
958+ }
959+ // since we dump logs from pods that are expected to constantly run,
960+ // namely kube-apiserver in kube-system namespace, we need to identify
961+ // if a restart of such pod happened during the test to correctly if we
962+ // want previous log
963+ containerTerminated := containerStatus .LastTerminationState .Terminated
964+ if containerTerminated != nil && containerTerminated .FinishedAt .After (testStartTime ) {
965+ previous = true
966+ state .LastTerminationReason = containerTerminated .Reason
967+ state .LastTerminationMessage = containerTerminated .Message
968+ }
969+ break
970+ }
971+ statesDump = append (statesDump , state )
972+
973+ var logFileName string
974+ if previous {
975+ logFileName = fmt .Sprintf ("%s-%s-%s-previous.log" , podNamespace , pod .Name , container .Name )
976+ } else {
977+ logFileName = fmt .Sprintf ("%s-%s-%s.log" , podNamespace , pod .Name , container .Name )
978+ }
979+
980+ req := clientSet .CoreV1 ().Pods (podNamespace ).GetLogs (pod .Name , & corev1.PodLogOptions {
905981 Container : container .Name ,
906982 Previous : previous ,
983+ SinceTime : & metav1.Time {Time : testStartTime },
907984 })
908- podLogsStream , err := req .Stream (context .TODO ())
985+
986+ streamCtx , cancel := context .WithTimeout (ctx , 10 * time .Second )
987+ podLogsStream , err := req .Stream (streamCtx )
909988 if err != nil {
910- errs = errors .Join (fmt .Errorf ("error getting container %s of pod %s logs: %w" , container .Name , pod .Name , err ), errs )
989+ cancel ()
990+ t .Logf ("error getting container %q of pod %q logs: %s" , container .Name , pod .Name , err )
911991 continue
912992 }
913993
914- _ , err = io .Copy (logFile , podLogsStream )
994+ b , err := io .ReadAll (podLogsStream )
995+ _ = podLogsStream .Close ()
996+ cancel ()
915997 if err != nil {
916- errs = errors .Join (fmt .Errorf ("error writing container %s of pod %s logs: %w" , container .Name , pod .Name , err ), errs )
998+ t .Logf ("error reading container %q logs of pod %q: %s" , container .Name , pod .Name , err )
999+ continue
9171000 }
9181001
919- _ = podLogsStream .Close ()
1002+ header := & tar.Header {
1003+ Name : logFileName ,
1004+ Size : int64 (len (b )),
1005+ Mode : 0600 ,
1006+ ModTime : time .Now (),
1007+ AccessTime : time .Now (),
1008+ ChangeTime : time .Now (),
1009+ }
1010+
1011+ if err := tarWriter .WriteHeader (header ); err != nil {
1012+ t .Logf ("error writing header of file %q in archive: %s" , logFileName , err )
1013+ continue
1014+ }
1015+
1016+ if _ , err := tarWriter .Write (b ); err != nil {
1017+ t .Logf ("error writing data of file %q in archive: %s" , logFileName , err )
1018+ }
9201019 }
9211020 }
9221021
923- return errs
1022+ b , err := json .Marshal (statesDump )
1023+ if err != nil {
1024+ t .Logf ("error marshalling pod states: %s" , err )
1025+ return
1026+ }
1027+
1028+ statesDumpFile := "containerPodsStates.json"
1029+ header := & tar.Header {
1030+ Name : statesDumpFile ,
1031+ Size : int64 (len (b )),
1032+ Mode : 0600 ,
1033+ ModTime : time .Now (),
1034+ AccessTime : time .Now (),
1035+ ChangeTime : time .Now (),
1036+ }
1037+
1038+ if err := tarWriter .WriteHeader (header ); err != nil {
1039+ t .Logf ("error writing header of file %q in archive: %s" , statesDumpFile , err )
1040+ return
1041+ }
1042+
1043+ if _ , err := tarWriter .Write (b ); err != nil {
1044+ t .Logf ("error writing data of file %q in archive: %s" , statesDumpFile , err )
1045+ }
9241046}
9251047
9261048// k8sKustomizeAdjustObjects adjusts the namespace of given k8s objects and calls the given callbacks for the containers and the pod
@@ -1142,12 +1264,10 @@ func k8sCreateObjects(ctx context.Context, client klient.Client, opts k8sCreateO
11421264 for idx := range objWithType .Subjects {
11431265 objWithType .Subjects [idx ].Namespace = opts .namespace
11441266 }
1145- continue
11461267 case * rbacv1.RoleBinding :
11471268 for idx := range objWithType .Subjects {
11481269 objWithType .Subjects [idx ].Namespace = opts .namespace
11491270 }
1150- continue
11511271 }
11521272 }
11531273 if err := client .Resources ().Create (ctx , obj ); err != nil {
@@ -1260,12 +1380,18 @@ type k8sContext struct {
12601380 esAPIKey string
12611381 // enrollParams contains the information needed to enroll an agent with Fleet in the test
12621382 enrollParams * fleettools.EnrollParams
1383+ // createdAt is the time when the k8sContext was created
1384+ createdAt time.Time
12631385}
12641386
12651387// getNamespace returns a unique namespace for the current test
1266- func (k8sContext ) getNamespace (t * testing.T ) string {
1388+ func (k k8sContext ) getNamespace (t * testing.T ) string {
1389+ nsUUID , err := uuid .NewV4 ()
1390+ if err != nil {
1391+ t .Fatalf ("error generating namespace UUID: %v" , err )
1392+ }
12671393 hasher := sha256 .New ()
1268- hasher .Write ([]byte (t . Name ()))
1394+ hasher .Write ([]byte (nsUUID . String ()))
12691395 testNamespace := strings .ToLower (base64 .URLEncoding .EncodeToString (hasher .Sum (nil )))
12701396 return noSpecialCharsRegexp .ReplaceAllString (testNamespace , "" )
12711397}
@@ -1323,7 +1449,7 @@ func k8sGetContext(t *testing.T, info *define.Info) k8sContext {
13231449 testLogsBasePath := os .Getenv ("K8S_TESTS_POD_LOGS_BASE" )
13241450 require .NotEmpty (t , testLogsBasePath , "K8S_TESTS_POD_LOGS_BASE must be set" )
13251451
1326- err = os .MkdirAll (filepath . Join ( testLogsBasePath , t . Name ()) , 0o755 )
1452+ err = os .MkdirAll (testLogsBasePath , 0o755 )
13271453 require .NoError (t , err , "failed to create test logs directory" )
13281454
13291455 esHost := os .Getenv ("ELASTICSEARCH_HOST" )
@@ -1346,6 +1472,7 @@ func k8sGetContext(t *testing.T, info *define.Info) k8sContext {
13461472 esHost : esHost ,
13471473 esAPIKey : esAPIKey ,
13481474 enrollParams : enrollParams ,
1475+ createdAt : time .Now (),
13491476 }
13501477}
13511478
@@ -1470,9 +1597,7 @@ func k8sStepDeployKustomize(kustomizePath string, containerName string, override
14701597
14711598 t .Cleanup (func () {
14721599 if t .Failed () {
1473- if err := k8sDumpAllPodLogs (ctx , kCtx .client , namespace , namespace , kCtx .logsBasePath ); err != nil {
1474- t .Logf ("failed to dump logs: %v" , err )
1475- }
1600+ k8sDumpPods (t , ctx , kCtx .client , t .Name (), namespace , kCtx .logsBasePath , kCtx .createdAt )
14761601 }
14771602
14781603 err := k8sDeleteObjects (ctx , kCtx .client , k8sDeleteOpts {wait : true }, objects ... )
@@ -1481,7 +1606,7 @@ func k8sStepDeployKustomize(kustomizePath string, containerName string, override
14811606 }
14821607 })
14831608
1484- err = k8sCreateObjects (ctx , kCtx .client , k8sCreateOpts {wait : true }, objects ... )
1609+ err = k8sCreateObjects (ctx , kCtx .client , k8sCreateOpts {wait : true , namespace : namespace }, objects ... )
14851610 require .NoError (t , err , "failed to create objects" )
14861611 }
14871612}
@@ -1544,8 +1669,10 @@ func k8sStepRunInnerTests(agentPodLabelSelector string, expectedPodNumber int, c
15441669
15451670 for _ , pod := range perNodePodList .Items {
15461671 var stdout , stderr bytes.Buffer
1672+ ctx , cancel := context .WithTimeout (ctx , 2 * time .Minute )
15471673 err = kCtx .client .Resources ().ExecInPod (ctx , namespace , pod .Name , containerName ,
15481674 []string {"/usr/share/elastic-agent/k8s-inner-tests" , "-test.v" }, & stdout , & stderr )
1675+ cancel ()
15491676 t .Logf ("%s k8s-inner-tests output:" , pod .Name )
15501677 t .Log (stdout .String ())
15511678 if err != nil {
@@ -1590,9 +1717,7 @@ func k8sStepHelmDeploy(chartPath string, releaseName string, values map[string]a
15901717
15911718 t .Cleanup (func () {
15921719 if t .Failed () {
1593- if err := k8sDumpAllPodLogs (ctx , kCtx .client , namespace , namespace , kCtx .logsBasePath ); err != nil {
1594- t .Logf ("failed to dump logs: %v" , err )
1595- }
1720+ k8sDumpPods (t , ctx , kCtx .client , t .Name (), namespace , kCtx .logsBasePath , kCtx .createdAt )
15961721 }
15971722
15981723 uninstallAction := action .NewUninstall (actionConfig )
@@ -1709,7 +1834,9 @@ func k8sStepCheckRestrictUpgrade(agentPodLabelSelector string, expectedPodNumber
17091834 var stdout , stderr bytes.Buffer
17101835
17111836 command := []string {"elastic-agent" , "upgrade" , "1.0.0" }
1837+ ctx , cancel := context .WithTimeout (ctx , 2 * time .Minute )
17121838 err := kCtx .client .Resources ().ExecInPod (ctx , namespace , pod .Name , containerName , command , & stdout , & stderr )
1839+ cancel ()
17131840 require .Error (t , err )
17141841 require .Contains (t , stderr .String (), coordinator .ErrNotUpgradable .Error ())
17151842 }
0 commit comments