From 3febd29e4a5bd1616d03c80dd023523dc0cd31bb Mon Sep 17 00:00:00 2001 From: Damien Ciabrini Date: Fri, 20 Jun 2025 15:00:43 +0200 Subject: [PATCH 1/2] Fix errors in KUTTL tests when running from the top-most directory . Improve the teardown of every test, so that KUTTL can run the tests in a random order without causing errors due to unexpected resource state. . Improve account and database creation tests so that they can be run from the top-most directory without causing KUTTL errors. . Also remove a test that expects the mariadb-operator to run in a pod on a dedicated namespace. This test doesn't add much coverage and removing it greatly simplifies testing locally during development or CI failure analysis. --- .../common/assert_sample_deployment.yaml | 26 ------------------- .../kuttl/tests/account_create/05-assert.yaml | 2 +- .../kuttl/tests/account_create/07-assert.yaml | 2 +- .../tests/database_create/03-assert.yaml | 4 +-- .../galera_cluster_restart/04-teardown.yaml | 13 ++++++++++ .../tests/galera_deploy_tls/03-teardown.yaml | 3 --- .../tests/galera_log_to_disk/03-teardown.yaml | 10 +++++++ 7 files changed, 27 insertions(+), 33 deletions(-) create mode 100644 tests/kuttl/tests/galera_cluster_restart/04-teardown.yaml create mode 100644 tests/kuttl/tests/galera_log_to_disk/03-teardown.yaml diff --git a/tests/kuttl/common/assert_sample_deployment.yaml b/tests/kuttl/common/assert_sample_deployment.yaml index a6080194..251435c6 100644 --- a/tests/kuttl/common/assert_sample_deployment.yaml +++ b/tests/kuttl/common/assert_sample_deployment.yaml @@ -130,29 +130,3 @@ apiVersion: v1 kind: ConfigMap metadata: name: openstack-config-data ---- -apiVersion: kuttl.dev/v1beta1 -kind: TestAssert -commands: - - script: | - # when using image digests the containerImage URLs are SHA's so we verify them with a script - tupleTemplate='{{ range (index .spec.template.spec.containers 1).env }}{{ .name }}{{ "#" }}{{ .value}}{{"\n"}}{{ end }}' - imageTuples=$(oc get -n 
openstack-operators deployment mariadb-operator-controller-manager -o go-template="$tupleTemplate") - # format of imageTuple is: RELATED_IMAGE_MARIADB_# separated by newlines - for ITEM in $(echo $imageTuples); do - # it is an image - if echo $ITEM | grep 'RELATED_IMAGE' &> /dev/null; then - NAME=$(echo $ITEM | sed -e 's|^RELATED_IMAGE_MARIADB_\([^_]*\)_.*|\1|') - IMG_FROM_ENV=$(echo $ITEM | sed -e 's|^.*#\(.*\)|\1|') - template='{{ (index .spec.template.spec.containers 0).image }}' - case $NAME in - IMAGE) - STATEFUL_SET_IMG=$(oc get -n $NAMESPACE statefulset openstack-galera -o go-template="$template") - ;; - esac - if [ "$STATEFUL_SET_IMG" != "$IMG_FROM_ENV" ]; then - echo "$NAME image does not equal $VALUE" - exit 1 - fi - fi - done diff --git a/tests/kuttl/tests/account_create/05-assert.yaml b/tests/kuttl/tests/account_create/05-assert.yaml index 8110a94f..6e1c152b 100644 --- a/tests/kuttl/tests/account_create/05-assert.yaml +++ b/tests/kuttl/tests/account_create/05-assert.yaml @@ -4,7 +4,7 @@ kind: TestAssert commands: - script: | set -e - ${MARIADB_KUTTL_DIR:-tests/kuttl/tests}/../common/scripts/check_db_account.sh openstack-galera-0 kuttldb_accounttest someuser dbsecret1 + ../../common/scripts/check_db_account.sh openstack-galera-0 kuttldb_accounttest someuser dbsecret1 # ensure db users are configured without TLS connection restriction oc rsh -n ${NAMESPACE} -c galera openstack-galera-0 /bin/sh -c 'mysql -uroot -p${DB_ROOT_PASSWORD} -e "show grants for \`someuser\`@\`%\`;"' | grep 'GRANT USAGE' | grep -v 'REQUIRE SSL' --- diff --git a/tests/kuttl/tests/account_create/07-assert.yaml b/tests/kuttl/tests/account_create/07-assert.yaml index 25bc2004..93a821af 100644 --- a/tests/kuttl/tests/account_create/07-assert.yaml +++ b/tests/kuttl/tests/account_create/07-assert.yaml @@ -3,4 +3,4 @@ apiVersion: kuttl.dev/v1beta1 kind: TestAssert commands: - script: | - ${MARIADB_KUTTL_DIR:-tests/kuttl/tests}/../common/scripts/check_db_account.sh openstack-galera-0 
kuttldb_accounttest someuser dbsecret1 --reverse + ../../common/scripts/check_db_account.sh openstack-galera-0 kuttldb_accounttest someuser dbsecret1 --reverse diff --git a/tests/kuttl/tests/database_create/03-assert.yaml b/tests/kuttl/tests/database_create/03-assert.yaml index 56b4cd28..ae1d87b6 100644 --- a/tests/kuttl/tests/database_create/03-assert.yaml +++ b/tests/kuttl/tests/database_create/03-assert.yaml @@ -13,8 +13,8 @@ commands: # for legacy secret non-present, test that a mariadb username was *not* made - script: | set -euxo pipefail - ${MARIADB_KUTTL_DIR:-tests/kuttl/tests}/../common/scripts/check_db_account.sh openstack-galera-0 kuttldb_utf8 kuttldb_utf_8 12345678 --reverse + ../../common/scripts/check_db_account.sh openstack-galera-0 kuttldb_utf8 kuttldb_utf_8 12345678 --reverse # for legacy secret present, test that a mariadb username was made - script: | set -euxo pipefail - ${MARIADB_KUTTL_DIR:-tests/kuttl/tests}/../common/scripts/check_db_account.sh openstack-galera-0 kuttldb_legacy_secret kuttldb_legacy_secret dbsecret1 + ../../common/scripts/check_db_account.sh openstack-galera-0 kuttldb_legacy_secret kuttldb_legacy_secret dbsecret1 diff --git a/tests/kuttl/tests/galera_cluster_restart/04-teardown.yaml b/tests/kuttl/tests/galera_cluster_restart/04-teardown.yaml new file mode 100644 index 00000000..1a0af63c --- /dev/null +++ b/tests/kuttl/tests/galera_cluster_restart/04-teardown.yaml @@ -0,0 +1,13 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +delete: + - apiVersion: mariadb.openstack.org/v1beta1 + kind: Galera + name: openstack + - apiVersion: v1 + kind: Secret + name: kuttl-galera-tls +commands: + - script: | + oc delete -n $NAMESPACE pvc mysql-db-openstack-galera-0 mysql-db-openstack-galera-1 mysql-db-openstack-galera-2 + for i in `oc get pv | awk '/mysql-db.*galera/ {print $1}'`; do oc patch pv $i -p '{"spec":{"claimRef": null}}'; done diff --git a/tests/kuttl/tests/galera_deploy_tls/03-teardown.yaml 
b/tests/kuttl/tests/galera_deploy_tls/03-teardown.yaml index 68c7f897..1a0af63c 100644 --- a/tests/kuttl/tests/galera_deploy_tls/03-teardown.yaml +++ b/tests/kuttl/tests/galera_deploy_tls/03-teardown.yaml @@ -7,9 +7,6 @@ delete: - apiVersion: v1 kind: Secret name: kuttl-galera-tls - - apiVersion: v1 - kind: Secret - name: kuttldb-secret commands: - script: | oc delete -n $NAMESPACE pvc mysql-db-openstack-galera-0 mysql-db-openstack-galera-1 mysql-db-openstack-galera-2 diff --git a/tests/kuttl/tests/galera_log_to_disk/03-teardown.yaml b/tests/kuttl/tests/galera_log_to_disk/03-teardown.yaml new file mode 100644 index 00000000..ce36b5dc --- /dev/null +++ b/tests/kuttl/tests/galera_log_to_disk/03-teardown.yaml @@ -0,0 +1,10 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +delete: + - apiVersion: mariadb.openstack.org/v1beta1 + kind: Galera + name: openstack +commands: + - script: | + oc delete -n $NAMESPACE pvc mysql-db-openstack-galera-0 mysql-db-openstack-galera-1 mysql-db-openstack-galera-2 + for i in `oc get pv | awk '/mysql-db.*galera/ {print $1}'`; do oc patch pv $i -p '{"spec":{"claimRef": null}}'; done From 09df39e818da497a2849e8f396afa7adb36b1267 Mon Sep 17 00:00:00 2001 From: Damien Ciabrini Date: Thu, 19 Jun 2025 16:52:41 +0200 Subject: [PATCH 2/2] Rework retry/timeout defaults to ensure fast service failover When the galera pod that receives database traffic becomes unresponsive, the galera library reacts by running a script in one of the surviving pods to elect a new endpoint. This script uses curl to call the API server to update the selector object responsible for balancing database traffic. If during the API call the API server becomes unresponsive/unreachable (e.g. the API VIP fails over to another master node), the curl call might get stuck for an unbounded period of time, which delays the traffic failover and can cause a long database service disruption. 
Add a default connect timeout and update default retry parameters so that curl is never blocked for too long, and the endpoint configuration can be retried until the API server becomes available. This commit only improves the default parameters, the ability to override those parameters will be addressed in a subsequent commit. Jira: OSPRH-17604 --- templates/galera/bin/mysql_wsrep_notify.sh | 19 ++++++++++++------- .../chainsaw/tests/service/chainsaw-test.yaml | 7 +++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/templates/galera/bin/mysql_wsrep_notify.sh b/templates/galera/bin/mysql_wsrep_notify.sh index 4658f419..08b0f772 100755 --- a/templates/galera/bin/mysql_wsrep_notify.sh +++ b/templates/galera/bin/mysql_wsrep_notify.sh @@ -11,9 +11,14 @@ NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace) TOKEN=$(cat ${SERVICEACCOUNT}/token) CACERT=${SERVICEACCOUNT}/ca.crt -# Retry config -RETRIES=6 -WAIT=1 +# OSPRH-17604: use default timeout and retry parameters for fast failover +# default parameters for curl calls to the API server +: ${WSREP_NOTIFY_CURL_CONNECT_TIMEOUT:=5} +: ${WSREP_NOTIFY_CURL_MAX_TIME:=30} +CURL="curl --connect-timeout ${WSREP_NOTIFY_CURL_CONNECT_TIMEOUT} --max-time ${WSREP_NOTIFY_CURL_MAX_TIME}" +# defaults parameters for retry on error +: ${WSREP_NOTIFY_RETRIES:=30} +: ${WSREP_NOTIFY_RETRY_WAIT:=1} ## @@ -66,7 +71,7 @@ function api_server { request="$request -d @-" fi local output - output=$(curl -s --cacert ${CACERT} --header "Content-Type:application/json" --header "Authorization: Bearer ${TOKEN}" --request $request ${APISERVER}/api/v1/namespaces/${NAMESPACE}/services/${service}) + output=$(${CURL} -s --cacert ${CACERT} --header "Content-Type:application/json" --header "Authorization: Bearer ${TOKEN}" --request $request ${APISERVER}/api/v1/namespaces/${NAMESPACE}/services/${service}) local rc=$? 
if [ $rc != 0 ]; then @@ -109,8 +114,8 @@ function parse_output { # Generic retry logic for an action function function retry { local action=$1 - local retries=$RETRIES - local wait=$WAIT + local retries=$WSREP_NOTIFY_RETRIES + local wait=$WSREP_NOTIFY_RETRY_WAIT local rc=1 $action @@ -132,7 +137,7 @@ function retry { mysql_probe_state reprobe done if [ $rc -ne 0 ]; then - log_error "Could not run action after ${RETRIES} tries. Stop retrying." + log_error "Could not run action after ${WSREP_NOTIFY_RETRIES} tries. Stop retrying." fi return $rc } diff --git a/tests/chainsaw/tests/service/chainsaw-test.yaml b/tests/chainsaw/tests/service/chainsaw-test.yaml index ebf8dcfc..9d05c45d 100644 --- a/tests/chainsaw/tests/service/chainsaw-test.yaml +++ b/tests/chainsaw/tests/service/chainsaw-test.yaml @@ -74,6 +74,11 @@ spec: check: # we dont want "ERROR 1047 (08S01) at line 495: WSREP has not yet prepared node for application use" (find_first($stdout,'(08S01)') == NULL): true + catch: &catch_logs + - script: + content: | + # get full logs for all pods except copy logs from kolla start + oc logs -n $NAMESPACE --prefix=true --tail=-1 -l galera/name=openstack | grep -v -e ' INFO:' - name: Service failover on pod crash description: Check that service is failing over when the current endpoint pod crashes @@ -97,6 +102,7 @@ spec: check: ($stdout != $endpoint): true - script: *no_wsrep_in_failover_check + catch: *catch_logs - name: No failover on random pod restart description: Check that service is not impacted when a pod that is not the current endpoint is stopped @@ -114,3 +120,4 @@ spec: check: ($stdout == $endpoint): true - script: *no_wsrep_in_failover_check + catch: *catch_logs