Commit 98a6a0f
Ensure Synced Condition on DB instance/cluster (#45)
Adds tests asserting that the ACK.ResourceSynced Condition is present on resources of kind DBCluster and DBInstance and that the status of those conditions is either False or True, depending on the expected state transition of the DB instance or cluster. This involved creating some helper functions in test/e2e/condition.py that assert that a Condition of a specific type and status is present in a resource's Status.Conditions collection.

While grepping through logs for the rds-controller, I noticed that the AvailabilityZone field for the DBInstance should be late-initialized, and so added that to the generator.yaml file.

Finally, I found that ACK.ResourceSynced/True was not being set on DBCluster and DBInstance resources when the status of the instance or cluster made its way to 'available'. I added an else block to the generic hook code templates to ensure that an ACK.ResourceSynced condition with a status of True is present when the resource's status transitions to 'available'.

Signed-off-by: Jay Pipes <[email protected]>

By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
1 parent 08d0c87 commit 98a6a0f
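
In outline, the flow the new tests exercise looks like this (a hedged sketch rather than the verbatim test; identifiers follow the diffs below, except status_matches, which is a hypothetical stand-in for the predicate the diff truncates):

    # Sketch of the end-to-end flow this commit adds. The condition
    # helpers come from the new test/e2e/condition.py; create/wait calls
    # follow the usage shown in that module's docstrings.
    from acktest.k8s import resource as k8s

    from e2e import condition, db_cluster

    def check_synced_tracks_status(ref, db_cluster_id, resource_data):
        k8s.create_custom_resource(ref, resource_data)
        k8s.wait_resource_consumed_by_controller(ref)

        # Immediately after creation the cluster is 'creating', so the
        # controller must report ACK.ResourceSynced=False.
        condition.assert_not_synced(ref)

        # Wait for RDS to report 'available'. status_matches is a
        # hypothetical predicate; the real argument is elided in the diff.
        db_cluster.wait_until(
            db_cluster_id, db_cluster.status_matches("available"))

        # Once available, the read-many hook must flip the condition True.
        condition.assert_synced(ref)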

File tree

10 files changed: +218, -14 lines


generator.yaml

Lines changed: 2 additions & 0 deletions

@@ -144,6 +144,8 @@ resources:
       - DBSubnetGroupNotFoundFault
       - DBParameterGroupNotFound
     fields:
+      AvailabilityZone:
+        late_initialize: {}
       DBInstanceIdentifier:
         is_primary_key: true
       MasterUserPassword:

pkg/resource/db_cluster/sdk.go

Lines changed: 2 additions & 1 deletion (generated file; diff not rendered)

pkg/resource/db_instance/manager.go

Lines changed: 7 additions & 2 deletions (generated file; diff not rendered)

pkg/resource/db_instance/sdk.go

Lines changed: 7 additions & 1 deletion (generated file; diff not rendered)

templates/hooks/db_cluster/sdk_read_many_post_set_output.go.tpl

Lines changed: 2 additions & 1 deletion

@@ -2,6 +2,7 @@
 	// Setting resource synced condition to false will trigger a requeue of
 	// the resource. No need to return a requeue error here.
 	setSyncedCondition(&resource{ko}, corev1.ConditionFalse, nil, nil)
-	return &resource{ko}, nil
+} else {
+	setSyncedCondition(&resource{ko}, corev1.ConditionTrue, nil, nil)
 }

templates/hooks/db_instance/sdk_read_many_post_set_output.go.tpl

Lines changed: 4 additions & 2 deletions

@@ -2,5 +2,7 @@
 	// Setting resource synced condition to false will trigger a requeue of
 	// the resource. No need to return a requeue error here.
 	setSyncedCondition(&resource{ko}, corev1.ConditionFalse, nil, nil)
-	return &resource{ko}, nil
-}
+} else {
+	setSyncedCondition(&resource{ko}, corev1.ConditionTrue, nil, nil)
+}
+

templates/hooks/db_instance/sdk_update_pre_build_request.go.tpl

Lines changed: 5 additions & 0 deletions

@@ -14,3 +14,8 @@
 	setSyncedCondition(desired, corev1.ConditionTrue, nil, nil)
 	return desired, nil
 }
+if !instanceAvailable(latest) {
+	msg := "DB instance cannot be modified while in '" + *latest.ko.Status.DBInstanceStatus + "' status"
+	setSyncedCondition(desired, corev1.ConditionFalse, &msg, nil)
+	return desired, requeueWaitUntilCanModify(latest)
+}
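
The guard above keeps ACK.ResourceSynced at False and requeues the resource while the instance is mid-transition, which is observable from the e2e side. A hedged sketch of that observation (the patch helper is assumed to exist in acktest; the field and wait constant follow the test diffs below):

    # Sketch: request a modification while the instance is still
    # transitioning, then confirm the controller reports it un-synced
    # (requeued) rather than attempting the ModifyDBInstance call.
    import time

    from acktest.k8s import resource as k8s

    from e2e import condition

    MODIFY_WAIT_AFTER_SECONDS = 20

    def modify_while_transitioning(ref):
        # CopyTagsToSnapshot is the field the tests below modify; the
        # lowerCamel spec key is an assumption here.
        updates = {"spec": {"copyTagsToSnapshot": True}}
        k8s.patch_custom_resource(ref, updates)  # assumed acktest helper
        time.sleep(MODIFY_WAIT_AFTER_SECONDS)
        # While the instance is not yet 'available', the update hook sets
        # ACK.ResourceSynced=False and requeues.
        condition.assert_not_synced(ref)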

test/e2e/condition.py

Lines changed: 145 additions & 0 deletions (new file)

@@ -0,0 +1,145 @@
# Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may
# not use this file except in compliance with the License. A copy of the
# License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

"""Utility functions to help with processing Kubernetes resource conditions"""

# TODO(jaypipes): Move these functions to the acktest library. The reason they
# live here is that the existing k8s.assert_condition_state_message doesn't
# actually assert anything; it returns True or False and logs messages.

import pytest

from acktest.k8s import resource as k8s

CONDITION_TYPE_ADOPTED = "ACK.Adopted"
CONDITION_TYPE_RESOURCE_SYNCED = "ACK.ResourceSynced"
CONDITION_TYPE_TERMINAL = "ACK.Terminal"
CONDITION_TYPE_RECOVERABLE = "ACK.Recoverable"
CONDITION_TYPE_ADVISORY = "ACK.Advisory"
CONDITION_TYPE_LATE_INITIALIZED = "ACK.LateInitialized"


def assert_type_status(
        ref: k8s.CustomResourceReference,
        cond_type_match: str = CONDITION_TYPE_RESOURCE_SYNCED,
        cond_status_match: bool = True,
):
    """Asserts that the supplied resource has a condition of the supplied
    type and that the Status of that condition matches the supplied status.

    Usage:
        from acktest.k8s import resource as k8s

        from e2e import condition

        ref = k8s.CustomResourceReference(
            CRD_GROUP, CRD_VERSION, RESOURCE_PLURAL,
            db_cluster_id, namespace="default",
        )
        k8s.create_custom_resource(ref, resource_data)
        k8s.wait_resource_consumed_by_controller(ref)
        condition.assert_type_status(
            ref,
            condition.CONDITION_TYPE_RESOURCE_SYNCED,
            False)

    Raises:
        pytest.fail when a condition of the specified type is not found or
        is not in the supplied status.
    """
    cond = k8s.get_resource_condition(ref, cond_type_match)
    if cond is None:
        msg = (f"Failed to find {cond_type_match} condition in "
               f"resource {ref}")
        pytest.fail(msg)

    cond_status = cond.get('status', None)
    if str(cond_status) != str(cond_status_match):
        msg = (f"Expected {cond_type_match} condition to "
               f"have status {cond_status_match} but found {cond_status}")
        pytest.fail(msg)


def assert_synced_status(
        ref: k8s.CustomResourceReference,
        cond_status_match: bool,
):
    """Asserts that the supplied resource has a condition of type
    ACK.ResourceSynced and that the Status of that condition matches the
    supplied status.

    Usage:
        from acktest.k8s import resource as k8s

        from e2e import condition

        ref = k8s.CustomResourceReference(
            CRD_GROUP, CRD_VERSION, RESOURCE_PLURAL,
            db_cluster_id, namespace="default",
        )
        k8s.create_custom_resource(ref, resource_data)
        k8s.wait_resource_consumed_by_controller(ref)
        condition.assert_synced_status(ref, False)

    Raises:
        pytest.fail when the ACK.ResourceSynced condition is not found or
        is not in the supplied status.
    """
    assert_type_status(ref, CONDITION_TYPE_RESOURCE_SYNCED, cond_status_match)


def assert_synced(ref: k8s.CustomResourceReference):
    """Asserts that the supplied resource has a condition of type
    ACK.ResourceSynced and that the Status of this condition is True.

    Usage:
        from acktest.k8s import resource as k8s

        from e2e import condition

        ref = k8s.CustomResourceReference(
            CRD_GROUP, CRD_VERSION, RESOURCE_PLURAL,
            db_cluster_id, namespace="default",
        )
        k8s.create_custom_resource(ref, resource_data)
        k8s.wait_resource_consumed_by_controller(ref)
        condition.assert_synced(ref)

    Raises:
        pytest.fail when the ACK.ResourceSynced condition is not found or
        is not in a True status.
    """
    return assert_synced_status(ref, True)


def assert_not_synced(ref: k8s.CustomResourceReference):
    """Asserts that the supplied resource has a condition of type
    ACK.ResourceSynced and that the Status of this condition is False.

    Usage:
        from acktest.k8s import resource as k8s

        from e2e import condition

        ref = k8s.CustomResourceReference(
            CRD_GROUP, CRD_VERSION, RESOURCE_PLURAL,
            db_cluster_id, namespace="default",
        )
        k8s.create_custom_resource(ref, resource_data)
        k8s.wait_resource_consumed_by_controller(ref)
        condition.assert_not_synced(ref)

    Raises:
        pytest.fail when the ACK.ResourceSynced condition is not found or
        is not in a False status.
    """
    return assert_synced_status(ref, False)
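
Because assert_type_status is generic over condition type and status, the same helper covers the other ACK condition types as well. A brief sketch (assuming ref is a CustomResourceReference already consumed by the controller):

    from e2e import condition

    # Expect the resource to carry a Terminal condition after an
    # unrecoverable API error (sketch only; whether a Terminal condition
    # is present depends on the scenario under test).
    condition.assert_type_status(
        ref,
        cond_type_match=condition.CONDITION_TYPE_TERMINAL,
        cond_status_match=True,
    )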

test/e2e/tests/test_db_cluster.py

Lines changed: 21 additions & 3 deletions

@@ -22,16 +22,32 @@
 from e2e import service_marker, CRD_GROUP, CRD_VERSION, load_rds_resource
 from e2e.replacement_values import REPLACEMENT_VALUES
 from e2e.bootstrap_resources import get_bootstrap_resources
-from e2e.fixtures import k8s_secret
+from e2e import condition
 from e2e import db_cluster
+from e2e.fixtures import k8s_secret
 
 RESOURCE_PLURAL = 'dbclusters'
 
 DELETE_WAIT_AFTER_SECONDS = 120
 
 # Time we wait after resource becoming available in RDS and checking the CR's
-# Status has been updated
-CHECK_STATUS_WAIT_SECONDS = 20
+# Status has been updated.
+#
+# NOTE(jaypipes): I have witnessed DB clusters transition from creating ->
+# available -> creating again :(
+#
+# This is rare, but seems to happen when RDS schedules a snapshot shortly
+# after a DB cluster is created. The cluster gets to 'available', then RDS
+# pauses the DB cluster, does some sort of snapshotting operation, then
+# unpauses the DB cluster. It seems that during this unpausing operation, the
+# DB cluster's status gets set back to 'creating' again.
+#
+# Unfortunately, because Aurora serverless doesn't emit Events for creation
+# and deletion like it does for DB instances, this is difficult, if not
+# impossible, to gather data on. Setting the check wait seconds here to 2
+# minutes since I believe this whole "pause-then-snapshot-then-unpause" thing
+# only takes about a minute until the DB cluster status settles into an
+# 'available' state.
+CHECK_STATUS_WAIT_SECONDS = 60*2
 
 MODIFY_WAIT_AFTER_SECONDS = 20
 
@@ -83,6 +99,7 @@ def test_crud_mysql_serverless(
     assert 'status' in cr
     assert 'status' in cr['status']
     assert cr['status']['status'] == 'creating'
+    condition.assert_not_synced(ref)
 
     db_cluster.wait_until(
         db_cluster_id,
@@ -104,6 +121,7 @@ def test_crud_mysql_serverless(
     assert 'status' in cr
     assert 'status' in cr['status']
     assert cr['status']['status'] != 'creating'
+    condition.assert_synced(ref)
 
     # We're now going to modify the CopyTagsToSnapshot field of the DB
     # instance, wait some time and verify that the RDS server-side resource
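
Given the status flapping described in the NOTE above, a more robust wait would require the status to hold steady rather than merely appear once. A hedged sketch of such a settle-wait (the boto3 calls are real; the helper itself is hypothetical and not part of this commit):

    import time

    import boto3

    def wait_until_settled(
        db_cluster_id: str,
        desired: str = "available",
        settle_seconds: int = 60,
        timeout_seconds: int = 1200,
    ):
        """Poll RDS until the DB cluster status has held `desired` for a
        full settle window, guarding against available -> creating flaps."""
        rds = boto3.client("rds")
        deadline = time.time() + timeout_seconds
        settled_since = None
        while time.time() < deadline:
            resp = rds.describe_db_clusters(DBClusterIdentifier=db_cluster_id)
            status = resp["DBClusters"][0]["Status"]
            if status == desired:
                if settled_since is None:
                    settled_since = time.time()
                elif time.time() - settled_since >= settle_seconds:
                    return
            else:
                # The status flapped; restart the settle window.
                settled_since = None
            time.sleep(15)
        raise TimeoutError(
            f"DB cluster {db_cluster_id} never settled into '{desired}'")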

test/e2e/tests/test_db_instance.py

Lines changed: 23 additions & 4 deletions

@@ -22,16 +22,33 @@
 from e2e import service_marker, CRD_GROUP, CRD_VERSION, load_rds_resource
 from e2e.replacement_values import REPLACEMENT_VALUES
 from e2e.bootstrap_resources import get_bootstrap_resources
-from e2e.fixtures import k8s_secret
+from e2e import condition
 from e2e import db_instance
+from e2e.fixtures import k8s_secret
 
 RESOURCE_PLURAL = 'dbinstances'
 
-DELETE_WAIT_AFTER_SECONDS = 120
+DELETE_WAIT_AFTER_SECONDS = 60*2
 
 # Time we wait after resource becoming available in RDS and checking the CR's
-# Status has been updated
-CHECK_STATUS_WAIT_SECONDS = 20
+# Status has been updated.
+#
+# NOTE(jaypipes): RDS does an automated backup as soon as a DB instance is
+# created. This automated backup can take 2-3 minutes, during which time the
+# DB instance's status will be 'backing-up'. In addition, I have noticed that
+# sometimes RDS will reset master credentials after doing the initial snapshot
+# backup, and this involves restarting the DB instance. This itself can take
+# an additional 2-3 minutes.
+#
+# What this means is that the DB instance goes through the following status
+# transitions:
+#
+#   creating -> available -> backing-up -> available ->
+#   resetting-master-credentials -> restarting -> available
+#
+# This can take upwards of 7 minutes for the DB instance to reach that
+# "final" available state.
+CHECK_STATUS_WAIT_SECONDS = 60*8
 
 MODIFY_WAIT_AFTER_SECONDS = 20
 
@@ -82,6 +99,7 @@ def test_crud_postgres13_t3_micro(
     assert 'status' in cr
     assert 'dbInstanceStatus' in cr['status']
     assert cr['status']['dbInstanceStatus'] == 'creating'
+    condition.assert_not_synced(ref)
 
     db_instance.wait_until(
         db_instance_id,
@@ -103,6 +121,7 @@ def test_crud_postgres13_t3_micro(
     assert 'status' in cr
     assert 'dbInstanceStatus' in cr['status']
     assert cr['status']['dbInstanceStatus'] != 'creating'
+    condition.assert_synced(ref)
 
     # We're now going to modify the CopyTagsToSnapshot field of the DB
     # instance, wait some time and verify that the RDS server-side resource
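
For reference, the conditions these assertions inspect live under the CR's status.conditions collection and can be read straight from the cluster. A hedged sketch using the official kubernetes Python client (the group/version strings are assumptions standing in for the CRD_GROUP/CRD_VERSION constants the tests use):

    from typing import Optional

    from kubernetes import client, config

    def get_synced_condition(namespace: str, name: str) -> Optional[dict]:
        """Fetch the ACK.ResourceSynced condition from a DBInstance CR."""
        config.load_kube_config()
        api = client.CustomObjectsApi()
        cr = api.get_namespaced_custom_object(
            group="rds.services.k8s.aws",  # assumed value of CRD_GROUP
            version="v1alpha1",            # assumed value of CRD_VERSION
            namespace=namespace,
            plural="dbinstances",
            name=name,
        )
        for cond in cr.get("status", {}).get("conditions", []):
            if cond.get("type") == "ACK.ResourceSynced":
                return cond  # e.g. {"type": ..., "status": "True", ...}
        return None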
