Commit 5bb4358

Fix(support-bundle): improve copy from host collector performance by checking daemonset pod FailedMount event and retry (#1281)
1 parent 39314ef commit 5bb4358
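Summary: when a copyFromHost DaemonSet pod cannot mount the requested hostPath, the pod records a FailedMount event and stays Pending, so the collector previously waited out the full readiness timeout. With this change the collector polls the DaemonSet pods while waiting and, for callers that opt out of retrying (the collectd collector sets RetryFailedMount: false, since its host path may legitimately be absent), returns "path does not exist" as soon as a FailedMount event appears. Cleanup is also sped up by deleting the DaemonSet and its pods with foreground propagation and a zero-second grace period. Below is a condensed sketch of the new check, distilled from the copy_from_host.go hunk further down; the helper name is illustrative and logging is omitted.

// Condensed, illustrative sketch of the fail-fast check (simplified from
// checkDaemonPodStatus in the diff below; klog logging omitted).
package collect

import (
	"context"
	"fmt"
	"strings"

	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// failFastOnFailedMount lists the DaemonSet's pods and, for any pod that is not
// yet Running, looks up its events by UID. A FailedMount event means the
// hostPath volume could not be mounted and the pod will stay Pending, so the
// caller can stop waiting immediately unless it asked to keep retrying.
func failFastOnFailedMount(ctx context.Context, client kubernetes.Interface, namespace string, labels map[string]string, retryFailedMount bool) error {
	var selector []string
	for k, v := range labels {
		selector = append(selector, fmt.Sprintf("%s=%s", k, v))
	}
	pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: strings.Join(selector, ",")})
	if err != nil {
		return errors.Wrap(err, "list daemonset pods")
	}
	for _, pod := range pods.Items {
		if pod.Status.Phase == corev1.PodRunning {
			continue
		}
		events, _ := client.CoreV1().Events(namespace).List(ctx, metav1.ListOptions{
			FieldSelector: fmt.Sprintf("involvedObject.uid=%s", pod.UID),
		})
		for _, event := range events.Items {
			if event.Reason == "FailedMount" && !retryFailedMount {
				return errors.Errorf("path does not exist")
			}
		}
	}
	return nil
}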

File tree

5 files changed: 230 additions, 14 deletions

pkg/collect/collectd.go
pkg/collect/collector.go
pkg/collect/copy_from_host.go
pkg/collect/copy_from_host_test.go
pkg/constants/constants.go

pkg/collect/collectd.go

Lines changed: 10 additions & 1 deletion
@@ -39,7 +39,16 @@ func (c *CollectCollectd) Collect(progressChan chan<- interface{}) (CollectorRes
 	}
 
 	rbacErrors := c.GetRBACErrors()
-	copyFromHostCollector := &CollectCopyFromHost{copyFromHost, c.BundlePath, c.Namespace, c.ClientConfig, c.Client, c.Context, rbacErrors}
+	copyFromHostCollector := &CollectCopyFromHost{
+		Collector:        copyFromHost,
+		BundlePath:       c.BundlePath,
+		Namespace:        c.Namespace,
+		ClientConfig:     c.ClientConfig,
+		Client:           c.Client,
+		Context:          c.Context,
+		RetryFailedMount: false,
+		RBACErrors:       rbacErrors,
+	}
 
 	return copyFromHostCollector.Collect(progressChan)
 }

pkg/collect/collector.go

Lines changed: 10 additions & 1 deletion
@@ -82,7 +82,16 @@ func GetCollector(collector *troubleshootv1beta2.Collect, bundlePath string, nam
 	case collector.Copy != nil:
 		return &CollectCopy{collector.Copy, bundlePath, namespace, clientConfig, client, ctx, RBACErrors}, true
 	case collector.CopyFromHost != nil:
-		return &CollectCopyFromHost{collector.CopyFromHost, bundlePath, namespace, clientConfig, client, ctx, RBACErrors}, true
+		return &CollectCopyFromHost{
+			Collector:        collector.CopyFromHost,
+			BundlePath:       bundlePath,
+			Namespace:        namespace,
+			ClientConfig:     clientConfig,
+			Client:           client,
+			Context:          ctx,
+			RetryFailedMount: true,
+			RBACErrors:       RBACErrors,
+		}, true
 	case collector.HTTP != nil:
 		return &CollectHTTP{collector.HTTP, bundlePath, namespace, clientConfig, client, RBACErrors}, true
 	case collector.Postgres != nil:

pkg/collect/copy_from_host.go

Lines changed: 69 additions & 12 deletions
@@ -31,12 +31,13 @@ import (
 )
 
 type CollectCopyFromHost struct {
-	Collector    *troubleshootv1beta2.CopyFromHost
-	BundlePath   string
-	Namespace    string
-	ClientConfig *rest.Config
-	Client       kubernetes.Interface
-	Context      context.Context
+	Collector        *troubleshootv1beta2.CopyFromHost
+	BundlePath       string
+	Namespace        string
+	ClientConfig     *rest.Config
+	Client           kubernetes.Interface
+	Context          context.Context
+	RetryFailedMount bool
 	RBACErrors
 }

@@ -73,7 +74,7 @@ func (c *CollectCopyFromHost) Collect(progressChan chan<- interface{}) (Collecto
 		namespace, _, _ = kubeconfig.Namespace()
 	}
 
-	_, cleanup, err := copyFromHostCreateDaemonSet(c.Context, c.Client, c.Collector, hostDir, namespace, "troubleshoot-copyfromhost-", labels)
+	_, cleanup, err := c.copyFromHostCreateDaemonSet(c.Context, c.Client, c.Collector, hostDir, namespace, "troubleshoot-copyfromhost-", labels)
 	defer cleanup()
 	if err != nil {
 		return nil, errors.Wrap(err, "create daemonset")

@@ -125,7 +126,7 @@ func (c *CollectCopyFromHost) Collect(progressChan chan<- interface{}) (Collecto
 	}
 }
 
-func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interface, collector *troubleshootv1beta2.CopyFromHost, hostPath string, namespace string, generateName string, labels map[string]string) (name string, cleanup func(), err error) {
+func (c *CollectCopyFromHost) copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interface, collector *troubleshootv1beta2.CopyFromHost, hostPath string, namespace string, generateName string, labels map[string]string) (name string, cleanup func(), err error) {
 	pullPolicy := corev1.PullIfNotPresent
 	volumeType := corev1.HostPathDirectory
 	if collector.ImagePullPolicy != "" {

@@ -229,6 +230,11 @@ func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interfac
 	for {
 		select {
 		case <-time.After(1 * time.Second):
+			err = checkDaemonPodStatus(client, ctx, labels, namespace, c.RetryFailedMount)
+			if err != nil {
+				return createdDS.Name, cleanup, err
+			}
+
 		case <-childCtx.Done():
 			klog.V(2).Infof("Timed out waiting for daemonset %s to be ready", createdDS.Name)
 			return createdDS.Name, cleanup, errors.Wrap(ctx.Err(), "wait for daemonset")

@@ -373,7 +379,14 @@ func copyFilesFromHost(ctx context.Context, dstPath string, clientConfig *restcl
 
 func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS *appsv1.DaemonSet, namespace string, labels map[string]string) {
 	klog.V(2).Infof("Daemonset %s has been scheduled for deletion", createdDS.Name)
-	if err := client.AppsV1().DaemonSets(namespace).Delete(context.Background(), createdDS.Name, metav1.DeleteOptions{}); err != nil {
+	zeroGracePeriod := int64(0)
+	// Foreground is used to delete the DaemonSet pods before deleting the DaemonSet
+	deletePropagationForeground := metav1.DeletePropagationForeground
+
+	if err := client.AppsV1().DaemonSets(namespace).Delete(ctx, createdDS.Name, metav1.DeleteOptions{
+		GracePeriodSeconds: &zeroGracePeriod,
+		PropagationPolicy:  &deletePropagationForeground,
+	}); err != nil {
 		klog.Errorf("Failed to delete daemonset %s: %v", createdDS.Name, err)
 		return
 	}

@@ -383,10 +396,24 @@ func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS
 		labelSelector = append(labelSelector, fmt.Sprintf("%s=%s", k, v))
 	}
 
-	dsPods := &corev1.PodList{}
+	dsPods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: strings.Join(labelSelector, ",")})
+	if err != nil {
+		klog.Errorf("Failed to list pods for DaemonSet %s: %v", createdDS.Name, err)
+		return
+	}
+
+	for _, pod := range dsPods.Items {
+		klog.V(2).Infof("Deleting pod %s", pod.Name)
+		if err := client.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{
+			GracePeriodSeconds: &zeroGracePeriod,
+		}); err != nil {
+			klog.Errorf("Failed to delete pod %s: %v", pod.Name, err)
+		}
+	}
+
 	klog.V(2).Infof("Continuously poll each second for Pod deletion of DaemontSet %s for maximum %d seconds", createdDS.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
 
-	err := wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
+	err = wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
 		pods, listErr := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
 			LabelSelector: strings.Join(labelSelector, ","),
 		})

@@ -410,7 +437,6 @@ func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS
 	// If there was an error from the polling (e.g., the context deadline was exceeded before all pods were deleted),
 	// delete each remaining pod with a zero-second grace period
 	if err != nil {
-		zeroGracePeriod := int64(0)
 		for _, pod := range dsPods.Items {
 			klog.V(2).Infof("Pod %s forcefully deleted after reaching the maximum wait time of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
 			err := client.CoreV1().Pods(namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{

@@ -424,3 +450,34 @@ func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS
 		}
 	}
 }
+
+func checkDaemonPodStatus(client kubernetes.Interface, ctx context.Context, labels map[string]string, namespace string, retryFailedMount bool) error {
+	var labelSelector []string
+	for k, v := range labels {
+		labelSelector = append(labelSelector, fmt.Sprintf("%s=%s", k, v))
+	}
+	pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
+		LabelSelector: strings.Join(labelSelector, ","),
+	})
+	if err != nil {
+		return errors.Wrap(err, "get daemonset pods")
+	}
+
+	for _, pod := range pods.Items {
+		if pod.Status.Phase != corev1.PodRunning {
+			events, _ := client.CoreV1().Events(namespace).List(ctx, metav1.ListOptions{
+				FieldSelector: fmt.Sprintf("involvedObject.uid=%s", pod.UID),
+			})
+
+			for _, event := range events.Items {
+				// If the pod has a FailedMount event, it means that the pod failed to mount the volume and the pod will be stuck in the Pending state.
+				// In this case, we return an error to the caller to indicate that path does not exist.
+				if event.Reason == "FailedMount" && !retryFailedMount {
+					klog.V(2).Infof("pod %s has a FailedMount event: %s", pod.Name, event.Message)
+					return errors.Errorf("path does not exist")
+				}
+			}
+		}
+	}
+	return nil
+}

pkg/collect/copy_from_host_test.go

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
+package collect
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	testclient "k8s.io/client-go/kubernetes/fake"
+)
+
+func Test_checkDaemonPodStatus(t *testing.T) {
+	tests := []struct {
+		name             string
+		namespace        string
+		podStatus        corev1.PodPhase
+		mockPod          *corev1.Pod
+		mockEvent        *corev1.Event
+		labels           map[string]string
+		retryFailedMount bool
+		expectedErr      bool
+		eventMessage     string
+	}{
+		{
+			name:      "Pod running without FailedMount event",
+			namespace: "test",
+			podStatus: corev1.PodRunning,
+			labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
+			mockPod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod",
+					Namespace: "test",
+					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+				},
+			},
+			expectedErr: false,
+		},
+		{
+			name:      "Pod not running without FailedMount event",
+			namespace: "test",
+			labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
+			podStatus: corev1.PodPending,
+			mockPod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod",
+					Namespace: "test",
+					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodPending,
+				},
+			},
+			expectedErr: false,
+		},
+		{
+			name:      "Pod running with FailedMount event and retryFailedMount disabled",
+			namespace: "test",
+			podStatus: corev1.PodRunning,
+			mockPod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod",
+					Namespace: "test",
+					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodPending,
+				},
+			},
+			mockEvent: &corev1.Event{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-event",
+					Namespace: "test",
+				},
+				Reason: "FailedMount",
+			},
+			retryFailedMount: false,
+			expectedErr:      true,
+			eventMessage:     `MountVolume.SetUp failed for volume "host" : hostPath type check failed: /var/lib/collectd is not a directory`,
+		},
+		{
+			name:      "Pod running with FailedMount event and retryFailedMount enabled",
+			namespace: "test",
+			podStatus: corev1.PodRunning,
+			mockPod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod",
+					Namespace: "test",
+					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodPending,
+				},
+			},
+			mockEvent: &corev1.Event{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-event",
+					Namespace: "test",
+				},
+				Reason: "FailedMount",
+			},
+			retryFailedMount: true,
+			expectedErr:      false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx := context.Background()
+			client := testclient.NewSimpleClientset()
+
+			if tt.mockPod != nil {
+				pod, err := client.CoreV1().Pods(tt.namespace).Create(ctx, tt.mockPod, metav1.CreateOptions{})
+				require.NoError(t, err)
+
+				if tt.mockEvent != nil {
+					event := tt.mockEvent
+					event.InvolvedObject = corev1.ObjectReference{
+						UID: pod.UID,
+					}
+					_, err = client.CoreV1().Events(tt.namespace).Create(ctx, event, metav1.CreateOptions{})
+					require.NoError(t, err)
+				}
+			}
+
+			err := checkDaemonPodStatus(client, ctx, tt.labels, tt.namespace, tt.retryFailedMount)
+			if tt.expectedErr {
+				require.Error(t, err)
+				if tt.mockEvent != nil {
+					require.Contains(t, err.Error(), "path does not exist")
+				}
+			} else {
+				require.NoError(t, err)
+			}
+		})
+	}
+}

pkg/constants/constants.go

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ const (
 	// DEFAULT_LOGS_COLLECTOR_TIMEOUT is the default timeout for logs collector.
 	DEFAULT_LOGS_COLLECTOR_TIMEOUT = 60 * time.Second
 	// MAX_TIME_TO_WAIT_FOR_POD_DELETION is the maximum time to wait for pod deletion.
+	// 0 seconds for force deletion.
 	MAX_TIME_TO_WAIT_FOR_POD_DELETION = 60 * time.Second
 	// Tracing constants
 	LIB_TRACER_NAME = "github.com/replicatedhq/troubleshoot"
