Skip to content

Commit da10015

Browse files
committed
Add debug log for test case TestProfileBundleDefaultIsKept
Collect must-gather Add operator logs Add a fix to address the cache staleness race condition
1 parent 0094b27 commit da10015

File tree

2 files changed

+248
-1
lines changed

2 files changed

+248
-1
lines changed

pkg/controller/profilebundle/profilebundle_controller.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,18 @@ func (r *ReconcileProfileBundle) Reconcile(ctx context.Context, request reconcil
252252
// Pod already exists and its init container at least ran - don't requeue
253253
reqLogger.Info("Skip reconcile: Workload already up-to-date", "Deployment.Namespace", found.Namespace, "Deployment.Name", found.Name)
254254

255+
// If status is PENDING but we're about to skip reconcile, this indicates
256+
// a cache staleness race condition (e.g., after operator restart where the ProfileBundle
257+
// was updated but we read stale cached data). Force a requeue to re-read from API server.
258+
if instance.Status.DataStreamStatus == compliancev1alpha1.DataStreamPending {
259+
reqLogger.Info("ProfileBundle is PENDING but workload appears up-to-date - likely cache staleness, requeuing",
260+
"ProfileBundle.ContentImage", instance.Spec.ContentImage,
261+
"Pod.Name", relevantPod.Name,
262+
"Pod.Phase", relevantPod.Status.Phase)
263+
// Requeue to give cache time to sync and re-read the ProfileBundle
264+
return reconcile.Result{Requeue: true, RequeueAfter: 2 * time.Second}, nil
265+
}
266+
255267
// Handle upgrades
256268
if instance.Status.DataStreamStatus == compliancev1alpha1.DataStreamValid &&
257269
instance.Status.Conditions.GetCondition("Ready") == nil {

tests/e2e/framework/common.go

Lines changed: 236 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ import (
88
"io"
99
"log"
1010
"os"
11+
"os/exec"
12+
"path/filepath"
1113
"regexp"
1214
"strconv"
1315
"strings"
@@ -20,6 +22,7 @@ import (
2022
imagev1 "github.com/openshift/api/image/v1"
2123
mcfgapi "github.com/openshift/api/machineconfiguration"
2224
mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
25+
admissionregistrationv1 "k8s.io/api/admissionregistration/v1"
2326
batchv1 "k8s.io/api/batch/v1"
2427
core "k8s.io/api/core/v1"
2528
corev1 "k8s.io/api/core/v1"
@@ -32,6 +35,7 @@ import (
3235
clusterv1alpha1 "open-cluster-management.io/api/cluster/v1alpha1"
3336

3437
"k8s.io/apimachinery/pkg/labels"
38+
"k8s.io/apimachinery/pkg/runtime/schema"
3539
"k8s.io/apimachinery/pkg/types"
3640
"k8s.io/apimachinery/pkg/util/wait"
3741
"k8s.io/client-go/kubernetes"
@@ -122,6 +126,158 @@ func (f *Framework) cleanUpFromYAMLFile(p *string) error {
122126
return nil
123127
}
124128

129+
// CollectMustGather collects must-gather data from the cluster for debugging test failures.
130+
// It attempts to determine the must-gather image from the CSV and runs oc adm must-gather.
131+
// If the CSV cannot be found or the must-gather image is not available, it falls back to
132+
// using a default image or skips collection.
133+
// The output is saved to ARTIFACT_DIR if set, otherwise to the current directory.
134+
func (f *Framework) CollectMustGather(testName string) {
135+
timestamp := time.Now().Format("20060102-150405")
136+
outputDirName := fmt.Sprintf("must-gather-%s-%s", testName, timestamp)
137+
138+
// Use ARTIFACT_DIR if set (for CI), otherwise use current directory
139+
baseDir := os.Getenv("ARTIFACT_DIR")
140+
var outputDir string
141+
if baseDir != "" {
142+
outputDir = filepath.Join(baseDir, outputDirName)
143+
log.Printf("Collecting must-gather data to artifact directory: %s\n", outputDir)
144+
} else {
145+
outputDir = outputDirName
146+
log.Printf("Collecting must-gather data to current directory: %s\n", outputDir)
147+
}
148+
149+
// Try to get the must-gather image from the CSV
150+
mustGatherImage := ""
151+
152+
// List all CSVs in the operator namespace
153+
csvList := &unstructured.UnstructuredList{}
154+
csvList.SetGroupVersionKind(schema.GroupVersionKind{
155+
Group: "operators.coreos.com",
156+
Version: "v1alpha1",
157+
Kind: "ClusterServiceVersionList",
158+
})
159+
160+
if err := f.Client.List(context.TODO(), csvList, client.InNamespace(f.OperatorNamespace)); err != nil {
161+
log.Printf("Warning: Failed to list CSVs: %v\n", err)
162+
} else {
163+
// Find the compliance-operator CSV
164+
for _, csv := range csvList.Items {
165+
if strings.Contains(csv.GetName(), "compliance-operator") {
166+
// Extract must-gather image from relatedImages
167+
relatedImages, found, err := unstructured.NestedSlice(csv.Object, "spec", "relatedImages")
168+
if err == nil && found {
169+
for _, img := range relatedImages {
170+
imgMap, ok := img.(map[string]interface{})
171+
if !ok {
172+
continue
173+
}
174+
name, _, _ := unstructured.NestedString(imgMap, "name")
175+
if name == "must-gather" {
176+
image, _, _ := unstructured.NestedString(imgMap, "image")
177+
mustGatherImage = image
178+
break
179+
}
180+
}
181+
}
182+
break
183+
}
184+
}
185+
}
186+
187+
// If we couldn't find the image from CSV, use the default
188+
if mustGatherImage == "" {
189+
mustGatherImage = "ghcr.io/complianceascode/must-gather-ocp:latest"
190+
log.Printf("Using default must-gather image: %s\n", mustGatherImage)
191+
} else {
192+
log.Printf("Using must-gather image from CSV: %s\n", mustGatherImage)
193+
}
194+
195+
// First, collect operator logs directly from the operator namespace
196+
// This is critical for osdk-e2e namespaces which may not be included in must-gather
197+
log.Printf("Collecting operator logs from namespace: %s\n", f.OperatorNamespace)
198+
f.CollectOperatorLogs(outputDir)
199+
200+
// Run oc adm must-gather
201+
log.Printf("Running: oc adm must-gather --image=%s --dest-dir=%s\n", mustGatherImage, outputDir)
202+
203+
cmd := exec.Command("oc", "adm", "must-gather", fmt.Sprintf("--image=%s", mustGatherImage), fmt.Sprintf("--dest-dir=%s", outputDir))
204+
cmd.Stdout = os.Stdout
205+
cmd.Stderr = os.Stderr
206+
207+
if err := cmd.Run(); err != nil {
208+
log.Printf("Warning: Failed to collect must-gather data: %v\n", err)
209+
log.Printf("Must-gather data was not collected, but test will continue\n")
210+
} else {
211+
log.Printf("Successfully collected must-gather data to %s\n", outputDir)
212+
log.Printf("You can find the must-gather archive in the current directory\n")
213+
}
214+
}
215+
216+
// CollectOperatorLogs collects logs and debug information from the operator namespace.
217+
// This is especially important for osdk-e2e test namespaces that may not be included in must-gather.
218+
// Uses 'oc adm inspect' to collect comprehensive debugging information and creates a tarball.
219+
func (f *Framework) CollectOperatorLogs(outputDir string) {
220+
log.Printf("Collecting logs from operator namespace: %s\n", f.OperatorNamespace)
221+
222+
// Create a temporary directory for oc adm inspect output
223+
tempDir, err := os.MkdirTemp("", "operator-inspect-*")
224+
if err != nil {
225+
log.Printf("Warning: Failed to create temp directory: %v\n", err)
226+
return
227+
}
228+
defer os.RemoveAll(tempDir)
229+
230+
// Run oc adm inspect to collect namespace data
231+
log.Printf("Running: oc adm inspect namespace/%s --dest-dir=%s\n", f.OperatorNamespace, tempDir)
232+
inspectCmd := exec.Command("oc", "adm", "inspect", fmt.Sprintf("namespace/%s", f.OperatorNamespace), fmt.Sprintf("--dest-dir=%s", tempDir))
233+
inspectCmd.Stdout = os.Stdout
234+
inspectCmd.Stderr = os.Stderr
235+
236+
if err := inspectCmd.Run(); err != nil {
237+
log.Printf("Warning: Failed to run oc adm inspect: %v\n", err)
238+
return
239+
}
240+
241+
log.Printf("Successfully collected operator namespace data with oc adm inspect\n")
242+
243+
// Determine the output location for the tarball
244+
var tarballPath string
245+
artifactDir := os.Getenv("ARTIFACT_DIR")
246+
if artifactDir != "" {
247+
// Use ARTIFACT_DIR if set (CI environment)
248+
if err := os.MkdirAll(artifactDir, 0755); err != nil {
249+
log.Printf("Warning: Failed to create artifact directory %s: %v\n", artifactDir, err)
250+
return
251+
}
252+
timestamp := time.Now().Format("20060102-150405")
253+
tarballPath = filepath.Join(artifactDir, fmt.Sprintf("operator-logs-%s-%s.tar.gz", f.OperatorNamespace, timestamp))
254+
} else {
255+
// Fall back to the provided output directory
256+
if err := os.MkdirAll(outputDir, 0755); err != nil {
257+
log.Printf("Warning: Failed to create output directory %s: %v\n", outputDir, err)
258+
return
259+
}
260+
timestamp := time.Now().Format("20060102-150405")
261+
tarballPath = filepath.Join(outputDir, fmt.Sprintf("operator-logs-%s-%s.tar.gz", f.OperatorNamespace, timestamp))
262+
}
263+
264+
// Create tarball of the collected data
265+
log.Printf("Creating tarball: %s\n", tarballPath)
266+
tarCmd := exec.Command("tar", "-czf", tarballPath, "-C", tempDir, ".")
267+
if tarOutput, err := tarCmd.CombinedOutput(); err != nil {
268+
log.Printf("Warning: Failed to create tarball: %v\nOutput: %s\n", err, string(tarOutput))
269+
return
270+
}
271+
272+
// Get tarball size
273+
fileInfo, err := os.Stat(tarballPath)
274+
if err != nil {
275+
log.Printf("Successfully created operator logs tarball at: %s\n", tarballPath)
276+
} else {
277+
log.Printf("Successfully created operator logs tarball at: %s (size: %d bytes)\n", tarballPath, fileInfo.Size())
278+
}
279+
}
280+
125281
func (f *Framework) PrintROSADebugInfo(t *testing.T) {
126282
// List cluster claims
127283
clusterClaimList := clusterv1alpha1.ClusterClaimList{}
@@ -337,6 +493,16 @@ func (f *Framework) addFrameworks() error {
337493
}
338494
}
339495

496+
// ValidatingAdmissionPolicy objects
497+
vapObjs := [1]dynclient.ObjectList{
498+
&admissionregistrationv1.ValidatingAdmissionPolicyList{},
499+
}
500+
for _, obj := range vapObjs {
501+
if err := AddToFrameworkScheme(admissionregistrationv1.AddToScheme, obj); err != nil {
502+
return fmt.Errorf("failed to add admissionregistration resource scheme to framework: %v", err)
503+
}
504+
}
505+
340506
return nil
341507
}
342508

@@ -580,7 +746,27 @@ func (f *Framework) WaitForProfileBundleStatus(name string, status compv1alpha1.
580746
return false, nil
581747
})
582748
if timeouterr != nil {
583-
return fmt.Errorf("ProfileBundle %s failed to reach state %s", name, status)
749+
// Log basic information about the failure
750+
log.Printf("ProfileBundle %s failed to reach state %s\n", name, status)
751+
log.Printf("Current status: %s\n", pb.Status.DataStreamStatus)
752+
log.Printf("ContentImage: %s\n", pb.Spec.ContentImage)
753+
log.Printf("ContentFile: %s\n", pb.Spec.ContentFile)
754+
755+
if pb.Status.ErrorMessage != "" {
756+
log.Printf("ErrorMessage: %s\n", pb.Status.ErrorMessage)
757+
}
758+
759+
// Collect must-gather data for comprehensive debugging
760+
log.Printf("Collecting must-gather data for detailed debugging information...\n")
761+
f.CollectMustGather(fmt.Sprintf("ProfileBundle-%s", name))
762+
763+
// Include error details in the returned error
764+
errMsg := fmt.Sprintf("ProfileBundle %s failed to reach state %s (current: %s)",
765+
name, status, pb.Status.DataStreamStatus)
766+
if pb.Status.ErrorMessage != "" {
767+
errMsg += fmt.Sprintf(", error: %s", pb.Status.ErrorMessage)
768+
}
769+
return fmt.Errorf("%s", errMsg)
584770
}
585771
log.Printf("ProfileBundle ready (%s)\n", pb.Status.DataStreamStatus)
586772
return nil
@@ -791,18 +977,62 @@ func (f *Framework) createMachineConfigPool(n string) error {
791977
return nil
792978
}
793979

980+
// validatingAdmissionPolicyExists checks if a ValidatingAdmissionPolicy with the given name exists
981+
func (f *Framework) validatingAdmissionPolicyExists(name string) (bool, error) {
982+
vap := &admissionregistrationv1.ValidatingAdmissionPolicy{}
983+
err := f.Client.Get(context.TODO(), types.NamespacedName{Name: name}, vap)
984+
if err != nil {
985+
if apierrors.IsNotFound(err) {
986+
return false, nil
987+
}
988+
return false, err
989+
}
990+
return true, nil
991+
}
992+
794993
func (f *Framework) createInvalidMachineConfigPool(n string) error {
795994
if f.Platform == "rosa" {
796995
fmt.Printf("bypassing MachineConfigPool test setup because it's not supported on %s\n", f.Platform)
797996
return nil
798997
}
998+
999+
// Check if ValidatingAdmissionPolicy exists
1000+
vapExists, err := f.validatingAdmissionPolicyExists("custom-machine-config-pool-selector")
1001+
if err != nil {
1002+
return fmt.Errorf("failed to check ValidatingAdmissionPolicy: %w", err)
1003+
}
1004+
7991005
p := &mcfgv1.MachineConfigPool{
8001006
ObjectMeta: metav1.ObjectMeta{Name: n},
8011007
Spec: mcfgv1.MachineConfigPoolSpec{
8021008
Paused: false,
8031009
},
8041010
}
8051011

1012+
// Only add selectors if ValidatingAdmissionPolicy exists
1013+
// This ensures backward compatibility with older clusters
1014+
if vapExists {
1015+
log.Printf("ValidatingAdmissionPolicy 'custom-machine-config-pool-selector' exists, adding minimal selectors to MachineConfigPool %s\n", n)
1016+
// Add minimal selectors to pass ValidatingAdmissionPolicy
1017+
// This pool is still "invalid" for testing as no nodes match this selector
1018+
p.Spec.NodeSelector = &metav1.LabelSelector{
1019+
MatchLabels: map[string]string{
1020+
"node-role.kubernetes.io/e2e-invalid": "",
1021+
},
1022+
}
1023+
p.Spec.MachineConfigSelector = &metav1.LabelSelector{
1024+
MatchExpressions: []metav1.LabelSelectorRequirement{
1025+
{
1026+
Key: "machineconfiguration.openshift.io/role",
1027+
Operator: metav1.LabelSelectorOpIn,
1028+
Values: []string{"worker"},
1029+
},
1030+
},
1031+
}
1032+
} else {
1033+
log.Printf("ValidatingAdmissionPolicy 'custom-machine-config-pool-selector' not found, creating MachineConfigPool %s without selectors (legacy mode)\n", n)
1034+
}
1035+
8061036
createErr := backoff.RetryNotify(
8071037
func() error {
8081038
err := f.Client.Create(context.TODO(), p, nil)
@@ -2945,3 +3175,8 @@ func (f *Framework) AssertRuleIsNodeType(ruleName, namespace string) error {
29453175
func (f *Framework) AssertRuleIsPlatformType(ruleName, namespace string) error {
29463176
return f.assertRuleType(ruleName, namespace, "Platform")
29473177
}
3178+
3179+
// int64Ptr returns a pointer to an int64 value
3180+
func int64Ptr(i int64) *int64 {
3181+
return &i
3182+
}

0 commit comments

Comments
 (0)