Skip to content

Commit 9a77538

Browse files
idler second known owner if idling the first one didn't work (codeready-toolchain#689)
1 parent c6505a7 commit 9a77538

File tree

4 files changed

+154
-37
lines changed

4 files changed

+154
-37
lines changed

controllers/idler/idler_controller.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -130,24 +130,30 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.
130130
return result, r.setStatusReady(ctx, idler)
131131
}
132132

133+
func getTimeout(idler *toolchainv1alpha1.Idler, pod corev1.Pod) int32 {
134+
timeoutSeconds := idler.Spec.TimeoutSeconds
135+
if isOwnedByVM(pod.ObjectMeta) {
136+
// use 1/12th of the timeout for VMs to have more aggressive idling to decrease
137+
// the infra costs because VMs consume much more resources
138+
timeoutSeconds = timeoutSeconds / 12
139+
}
140+
return timeoutSeconds
141+
}
142+
133143
func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1.Idler) (time.Duration, error) {
134144
// Get all pods running in the namespace
135145
podList := &corev1.PodList{}
136146
if err := r.AllNamespacesClient.List(ctx, podList, client.InNamespace(idler.Name)); err != nil {
137147
return 0, err
138148
}
139-
ownerIdler := newOwnerIdler(r.DiscoveryClient, r.DynamicClient, r.ScalesClient, r.RestClient)
149+
ownerIdler := newOwnerIdler(idler, r)
140150
requeueAfter := time.Duration(idler.Spec.TimeoutSeconds) * time.Second
141151
var idleErrors []error
142152
for _, pod := range podList.Items {
143153
podLogger := log.FromContext(ctx).WithValues("pod_name", pod.Name, "pod_phase", pod.Status.Phase)
144154
podCtx := log.IntoContext(ctx, podLogger)
145-
timeoutSeconds := idler.Spec.TimeoutSeconds
146-
if isOwnedByVM(pod.ObjectMeta) {
147-
// use 1/12th of the timeout for VMs to have more aggressive idling to decrease
148-
// the infra costs because VMs consume much more resources
149-
timeoutSeconds = timeoutSeconds / 12
150-
}
155+
156+
timeoutSeconds := getTimeout(idler, pod)
151157
if pod.Status.StartTime != nil {
152158
// check the restart count for the pod
153159
restartCount := getHighestRestartCount(pod.Status)
@@ -167,6 +173,7 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1.
167173
// Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero.
168174
err := r.deletePodsAndCreateNotification(podCtx, pod, idler, ownerIdler)
169175
if err == nil {
176+
requeueAfter = shorterDuration(requeueAfter, time.Duration(float32(timeoutSeconds)*0.05)*time.Second)
170177
continue
171178
}
172179
idleErrors = append(idleErrors, err)

controllers/idler/idler_controller_test.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,8 @@ func TestEnsureIdling(t *testing.T) {
234234
HasConditions(memberoperatortest.Running(), memberoperatortest.IdlerNotificationCreated())
235235

236236
assert.True(t, res.Requeue)
237-
assertRequeueTimeInDelta(t, res.RequeueAfter, idler.Spec.TimeoutSeconds/24)
237+
// something was idled, expect the next reconcile in 5% of the timeout
238+
assertRequeueTimeInDelta(t, res.RequeueAfter, int32(float32(idler.Spec.TimeoutSeconds)*0.05/12))
238239

239240
t.Run("No pods. requeue after idler timeout.", func(t *testing.T) {
240241
//given
@@ -362,7 +363,8 @@ func TestEnsureIdling(t *testing.T) {
362363
//then
363364
require.NoError(t, err)
364365
assert.True(t, res.Requeue)
365-
assert.Equal(t, time.Duration(idler.Spec.TimeoutSeconds)*time.Second, res.RequeueAfter)
366+
// something was idled, expect the next reconcile in 5% of the timeout
367+
assert.Equal(t, time.Duration(int32(float32(idler.Spec.TimeoutSeconds)*0.05/12))*time.Second, res.RequeueAfter)
366368
memberoperatortest.AssertThatIdler(t, idler.Name, fakeClients).
367369
HasConditions(memberoperatortest.Running(), memberoperatortest.IdlerNotificationCreated())
368370
//check the notification is actually created
@@ -384,7 +386,8 @@ func TestEnsureIdling(t *testing.T) {
384386
//then
385387
require.NoError(t, err)
386388
assert.True(t, res.Requeue)
387-
assert.Equal(t, time.Duration(idler.Spec.TimeoutSeconds)*time.Second, res.RequeueAfter)
389+
// pods (exceeding the timeout) are still running, expect the next reconcile in 5% of the timeout
390+
assert.Equal(t, time.Duration(int32(float32(idler.Spec.TimeoutSeconds)*0.05/12))*time.Second, res.RequeueAfter)
388391
memberoperatortest.AssertThatIdler(t, idler.Name, fakeClients).
389392
HasConditions(memberoperatortest.Running(), memberoperatortest.IdlerNotificationCreated())
390393

@@ -606,7 +609,7 @@ func TestNotificationAppNameTypeForPods(t *testing.T) {
606609
for pt, tcs := range testpod {
607610
t.Run(pt, func(t *testing.T) {
608611
reconciler, _, fakeClients := prepareReconcile(t, idler.Name, getHostCluster, idler, nsTmplSet, mur)
609-
ownerIdler := newOwnerIdler(reconciler.DiscoveryClient, reconciler.DynamicClient, reconciler.ScalesClient, reconciler.RestClient)
612+
ownerIdler := newOwnerIdler(idler, reconciler)
610613
pod, appName := tcs.preparePayload(fakeClients)
611614

612615
// when

controllers/idler/owners.go

Lines changed: 46 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,12 @@ package idler
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
7+
"time"
68

9+
toolchainv1alpha1 "github.com/codeready-toolchain/api/api/v1alpha1"
10+
corev1 "k8s.io/api/core/v1"
711
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
812
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
913
"k8s.io/apimachinery/pkg/runtime/schema"
@@ -16,55 +20,79 @@ import (
1620
)
1721

1822
type ownerIdler struct {
23+
idler *toolchainv1alpha1.Idler
1924
ownerFetcher *ownerFetcher
2025
dynamicClient dynamic.Interface
2126
scalesClient scale.ScalesGetter
2227
restClient rest.Interface
2328
}
2429

25-
func newOwnerIdler(discoveryClient discovery.ServerResourcesInterface, dynamicClient dynamic.Interface, scalesClient scale.ScalesGetter, restClient rest.Interface) *ownerIdler {
30+
func newOwnerIdler(idler *toolchainv1alpha1.Idler, reconciler *Reconciler) *ownerIdler {
2631
return &ownerIdler{
27-
ownerFetcher: newOwnerFetcher(discoveryClient, dynamicClient),
28-
dynamicClient: dynamicClient,
29-
scalesClient: scalesClient,
30-
restClient: restClient,
32+
idler: idler,
33+
ownerFetcher: newOwnerFetcher(reconciler.DiscoveryClient, reconciler.DynamicClient),
34+
dynamicClient: reconciler.DynamicClient,
35+
scalesClient: reconciler.ScalesClient,
36+
restClient: reconciler.RestClient,
3137
}
3238
}
3339

34-
// scaleOwnerToZero fetches the whole tree of the controller owners from the provided object (Deployment, ReplicaSet, etc).
40+
// scaleOwnerToZero fetches the whole tree of the controller owners from the provided pod.
3541
// If any known controller owner is found, then it's scaled down (or deleted) and its kind and name is returned.
42+
// If the pod has been running for longer than 105% of the idler timeout, it will also idle the second known owner.
43+
// This is a workaround for cases when the top owner controller fails to idle the workload. For example the AAP controller sometimes fails to scale down StatefulSets for postgres pods owned by the top AAP CR. Scaling down the StatefulSet (AAP -> StatefulSet -> Pods) mitigates that AAP controller bug.
3644
// Otherwise, returns empty strings.
37-
func (i *ownerIdler) scaleOwnerToZero(ctx context.Context, meta metav1.Object) (kind string, name string, err error) {
45+
func (i *ownerIdler) scaleOwnerToZero(ctx context.Context, pod *corev1.Pod) (string, string, error) {
3846
logger := log.FromContext(ctx)
39-
logger.Info("Scaling owner to zero", "name", meta.GetName())
47+
logger.Info("Scaling owner to zero")
4048

41-
owners, err := i.ownerFetcher.getOwners(ctx, meta)
49+
owners, err := i.ownerFetcher.getOwners(ctx, pod)
4250
if err != nil {
4351
logger.Error(err, "failed to find all owners, try to idle the workload with information that is available")
4452
}
53+
54+
var topOwnerKind, topOwnerName string
55+
var errToReturn error
4556
for _, ownerWithGVR := range owners {
4657
owner := ownerWithGVR.object
47-
kind = owner.GetObjectKind().GroupVersionKind().Kind
48-
name = owner.GetName()
49-
switch kind {
58+
ownerKind := owner.GetObjectKind().GroupVersionKind().Kind
59+
60+
switch ownerKind {
5061
case "Deployment", "ReplicaSet", "Integration", "KameletBinding", "StatefulSet", "ReplicationController":
5162
err = i.scaleToZero(ctx, ownerWithGVR)
52-
return
5363
case "DaemonSet", "Job":
5464
err = i.deleteResource(ctx, ownerWithGVR) // Nothing to scale down. Delete instead.
55-
return
5665
case "DeploymentConfig":
5766
err = i.scaleDeploymentConfigToZero(ctx, ownerWithGVR)
58-
return
5967
case "VirtualMachine":
6068
err = i.stopVirtualMachine(ctx, ownerWithGVR) // Nothing to scale down. Stop instead.
61-
return
6269
case "AnsibleAutomationPlatform":
6370
err = i.idleAAP(ctx, ownerWithGVR) // Nothing to scale down. Stop instead.
64-
return
71+
default:
72+
continue // Skip unknown owner types
73+
}
74+
75+
// Store the first processed owner's info and preserve its error
76+
if topOwnerKind == "" {
77+
topOwnerKind = ownerKind
78+
topOwnerName = owner.GetName()
79+
errToReturn = err
80+
} else {
81+
// if it's the second known owner being processed, then stop the loop
82+
errToReturn = errors.Join(errToReturn, err)
83+
break
84+
}
85+
86+
// If no error occurred and the pod doesn't run for longer than 105% of the idler timeout, return immediately after the first owner was idled
87+
timeoutSeconds := getTimeout(i.idler, *pod)
88+
if err == nil && !time.Now().After(pod.Status.StartTime.Add(time.Duration(float64(timeoutSeconds)*1.05)*time.Second)) {
89+
return topOwnerKind, topOwnerName, nil
6590
}
91+
logger.Info("Scaling the first known owner down either failed or the pod has been running for longer than 105% of the idler timeout. Scaling the next known owner.")
6692
}
67-
return "", "", nil
93+
94+
// Return the first processed owner's info (or empty if none were processed), and the list of errors (if any happened)
95+
return topOwnerKind, topOwnerName, errToReturn
6896
}
6997

7098
var supportedScaleResources = map[schema.GroupVersionKind]schema.GroupVersionResource{

controllers/idler/owners_test.go

Lines changed: 87 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ import (
88
"slices"
99
"strings"
1010
"testing"
11+
"time"
1112

12-
"github.com/codeready-toolchain/api/api/v1alpha1"
13+
toolchainv1alpha1 "github.com/codeready-toolchain/api/api/v1alpha1"
1314
"github.com/codeready-toolchain/member-operator/pkg/apis"
1415
"github.com/codeready-toolchain/member-operator/test"
1516
testcommon "github.com/codeready-toolchain/toolchain-common/pkg/test"
@@ -191,7 +192,7 @@ var testConfigs = map[string]createTestConfigFunc{
191192
}
192193

193194
func TestAppNameTypeForControllers(t *testing.T) {
194-
setup := func(t *testing.T, createTestConfig createTestConfigFunc) (*ownerIdler, *test.FakeClientSet, payloadTestConfig, payloads, *corev1.Pod) {
195+
setup := func(t *testing.T, createTestConfig createTestConfigFunc, extraExceedTimeout bool) (*ownerIdler, *test.FakeClientSet, payloadTestConfig, payloads, *corev1.Pod) {
195196
dynamicClient := fakedynamic.NewSimpleDynamicClient(scheme.Scheme)
196197
fakeDiscovery := newFakeDiscoveryClient(allResourcesList(t)...)
197198
scalesClient := &fakescale.FakeScaleClient{}
@@ -202,9 +203,35 @@ func TestAppNameTypeForControllers(t *testing.T) {
202203
gock.OffAll()
203204
})
204205

205-
ownerIdler := newOwnerIdler(fakeDiscovery, dynamicClient, scalesClient, restClient)
206+
timeoutSeconds := int32(3600) // 1 hour
207+
idler := &toolchainv1alpha1.Idler{
208+
ObjectMeta: metav1.ObjectMeta{
209+
Name: "test-idler",
210+
},
211+
Spec: toolchainv1alpha1.IdlerSpec{
212+
TimeoutSeconds: timeoutSeconds,
213+
},
214+
}
215+
ownerIdler := &ownerIdler{
216+
idler: idler,
217+
ownerFetcher: newOwnerFetcher(fakeDiscovery, dynamicClient),
218+
dynamicClient: dynamicClient,
219+
scalesClient: scalesClient,
220+
restClient: restClient,
221+
}
206222

207-
plds := preparePayloads(t, &test.FakeClientSet{DynamicClient: dynamicClient, AllNamespacesClient: testcommon.NewFakeClient(t)}, "alex-stage", "", freshStartTimes(60))
223+
// Calculate start time based on whether timeout should be exceeded
224+
timeoutRatio := 1.01
225+
if extraExceedTimeout {
226+
timeoutRatio = 1.1
227+
}
228+
229+
startTimes := payloadStartTimes{
230+
defaultStartTime: time.Now().Add(-time.Duration(float64(timeoutSeconds)*timeoutRatio) * time.Second),
231+
vmStartTime: time.Now().Add(-time.Duration(float64(timeoutSeconds)/12*timeoutRatio) * time.Second),
232+
}
233+
234+
plds := preparePayloads(t, &test.FakeClientSet{DynamicClient: dynamicClient, AllNamespacesClient: testcommon.NewFakeClient(t)}, "alex-stage", "", startTimes)
208235
tc := createTestConfig(plds)
209236

210237
p := plds.getFirstControlledPod(tc.podOwnerName)
@@ -220,7 +247,7 @@ func TestAppNameTypeForControllers(t *testing.T) {
220247
for kind, createTestConfig := range testConfigs {
221248
t.Run(kind, func(t *testing.T) {
222249
//given
223-
ownerIdler, fakeClients, testConfig, plds, pod := setup(t, createTestConfig)
250+
ownerIdler, fakeClients, testConfig, plds, pod := setup(t, createTestConfig, false)
224251

225252
//when
226253
appType, appName, err := ownerIdler.scaleOwnerToZero(context.TODO(), pod)
@@ -236,6 +263,29 @@ func TestAppNameTypeForControllers(t *testing.T) {
236263
othersTCFunc(plds).ownerScaledUp(assertion)
237264
}
238265
}
266+
assertOtherOwners(t, ownerIdler, pod, false)
267+
})
268+
}
269+
})
270+
271+
t.Run("timeout exceeded - multiple owners processed", func(t *testing.T) {
272+
for kind, createTestConfig := range testConfigs {
273+
t.Run(kind, func(t *testing.T) {
274+
//given - pod running for more than 105% of timeout
275+
ownerIdler, fakeClients, testConfig, _, pod := setup(t, createTestConfig, true)
276+
277+
//when
278+
appType, appName, err := ownerIdler.scaleOwnerToZero(context.TODO(), pod)
279+
280+
//then
281+
require.NoError(t, err)
282+
require.Equal(t, kind, appType)
283+
require.Equal(t, testConfig.expectedAppName, appName)
284+
assertion := test.AssertThatInIdleableCluster(t, fakeClients)
285+
testConfig.ownerScaledDown(assertion)
286+
287+
// when there are multiple owners, then verify that the second one is scaled down too
288+
assertOtherOwners(t, ownerIdler, pod, true)
239289
})
240290
}
241291
})
@@ -244,7 +294,7 @@ func TestAppNameTypeForControllers(t *testing.T) {
244294
for kind, createTestConfig := range testConfigs {
245295
t.Run(kind, func(t *testing.T) {
246296
//given
247-
ownerIdler, fakeClients, testConfig, plds, pod := setup(t, createTestConfig)
297+
ownerIdler, fakeClients, testConfig, plds, pod := setup(t, createTestConfig, false)
248298
gock.OffAll()
249299
// mock stop call
250300
mockStopVMCalls(".*", ".*", http.StatusInternalServerError)
@@ -280,13 +330,14 @@ func TestAppNameTypeForControllers(t *testing.T) {
280330
othersTCFunc(plds).ownerScaledUp(assertion)
281331
}
282332
}
333+
assertOtherOwners(t, ownerIdler, pod, true)
283334
})
284335
}
285336
})
286337

287338
t.Run("error when getting owner deployment is ignored", func(t *testing.T) {
288339
// given
289-
ownerIdler, fakeClients, testConfig, plds, pod := setup(t, testConfigs["Deployment"])
340+
ownerIdler, fakeClients, testConfig, plds, pod := setup(t, testConfigs["Deployment"], false)
290341
reactionChain := fakeClients.DynamicClient.ReactionChain
291342
fakeClients.DynamicClient.PrependReactor("get", "deployments", func(action clienttest.Action) (handled bool, ret runtime.Object, err error) {
292343
return true, nil, errors.New("can't get deployment")
@@ -310,6 +361,34 @@ func TestAppNameTypeForControllers(t *testing.T) {
310361
})
311362
}
312363

364+
func assertOtherOwners(t *testing.T, ownerIdler *ownerIdler, pod *corev1.Pod, secondOwnerIdled bool) {
365+
owners, err := ownerIdler.ownerFetcher.getOwners(context.TODO(), pod)
366+
if !apierrors.IsNotFound(err) {
367+
require.NoError(t, err)
368+
}
369+
// if there are more owners than one
370+
if len(owners) > 1 {
371+
// by default, all other owners shouldn't be idled
372+
notIdledOwnersStartIndex := 1
373+
if secondOwnerIdled {
374+
// if the second owner is supposed to be idled
375+
assertReplicas(t, owners[1].object, 0)
376+
// then set the start index for all other owners not idled at 2
377+
notIdledOwnersStartIndex = 2
378+
}
379+
// check that all other owners are not idled
380+
for i := notIdledOwnersStartIndex; i < len(owners)-1; i++ {
381+
assertReplicas(t, owners[i].object, 3)
382+
}
383+
}
384+
}
385+
386+
func assertReplicas(t *testing.T, object *unstructured.Unstructured, expReplicas int64) {
387+
replicas, _, err := unstructured.NestedInt64(object.UnstructuredContent(), "spec", "replicas")
388+
require.NoError(t, err)
389+
require.Equal(t, expReplicas, replicas)
390+
}
391+
313392
func TestGetAPIResourceList(t *testing.T) {
314393
// given
315394
pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "test-namespace"}}
@@ -518,7 +597,7 @@ func TestGetOwnersFailures(t *testing.T) {
518597
t.Run("intermediate owner is returned", func(t *testing.T) {
519598
// given
520599
pod := givenPod.DeepCopy()
521-
idler := &v1alpha1.Idler{ObjectMeta: metav1.ObjectMeta{Name: "test-rc", Namespace: "test-namespace"}}
600+
idler := &toolchainv1alpha1.Idler{ObjectMeta: metav1.ObjectMeta{Name: "test-rc", Namespace: "test-namespace"}}
522601
require.NoError(t, controllerruntime.SetControllerReference(idler, pod, scheme.Scheme))
523602
require.NoError(t, controllerruntime.SetControllerReference(ownerObject, idler, scheme.Scheme))
524603
// when it's supposed to be "not found" then do not include it in the client

0 commit comments

Comments
 (0)