Skip to content

Commit 3047b8a

Browse files
committed
test: enhance smoke test suite reliability and add comprehensive checks
Major improvements to the ClickHouse Helm chart test suite addressing timing issues, adding new test scenarios, and improving robustness.

## Timing and Reliability Fixes
- Increase timeout for system.clusters verification (60s → 180s)
  Allows proper cluster topology propagation after deployment
- Add retry logic for ON CLUSTER queries during upgrades
  Wait for cluster config propagation before distributed operations
- Add service endpoints verification with retry (60s timeout)
  Ensure all endpoints register after pod creation
- Fix PVC verification to only check mounted volumes
  Prevents false failures from orphaned PVCs (reclaimPolicy: Retain)
  Affects: verify_clickhouse_pvc_size, verify_log_persistence, verify_keeper_storage

## New Test Features
- Add data survival verification for upgrade scenarios
  Creates test table before upgrade, verifies data after
  Smart detection: only runs for in-place upgrades (same nameOverride)
  Skips cluster replacement scenarios with informative note
- Add Keeper high availability chaos test
  Deletes one Keeper pod from 3-node quorum
  Verifies ClickHouse remains writable with 2/3 Keepers
  Runs automatically for replicated deployments
- Add metrics endpoint verification
  Tests operator metrics endpoint accessibility
  Validates Prometheus format output

## Robustness Improvements
- Implement get_ready_clickhouse_pod() helper
  Selects first Running pod instead of blindly using pods[0]
  Eliminates race conditions during pod initialization
  Applied to: verify_clickhouse_version, verify_user_connection, verify_extra_config_values
- Refactor get_cluster_topology() to use JSON parsing
  Changed from FORMAT TabSeparated to FORMAT JSON
  Adds proper error handling with json.JSONDecodeError
  Type-safe field access with .get() methods
- Refactor verify_metrics_endpoint() to use operator pod
  Access via localhost from operator pod (more reliable)
  Implement curl/wget fallback for tool availability
  Removes dependency on ClickHouse pod having network tools
- Add get_operator_pod() helper function
  Finds operator pod (excludes CHI pods)
  Used for metrics endpoint testing

## Warning Cleanup
- Fix 'level' setting warning in extraConfig parsing
  Add 'level' to skip list (nested logger element)
- Improve secrets verification messaging
  Change warning to informational for inline credentials
  Acknowledges inline config is valid for smoke tests
- Remove non-existent upgrade scenario from test list
  Cleanup UPGRADE_SCENARIOS list

## New Helper Functions
tests/steps/clickhouse.py:
- get_ready_clickhouse_pod() - Select first Running pod
- get_operator_pod() - Find operator pod
- create_test_data() - Create test data for upgrade verification
- verify_data_survival() - Verify data survived upgrade
- test_keeper_high_availability() - Chaos test for Keeper quorum
- verify_metrics_endpoint() - Test metrics endpoint (refactored)

tests/steps/kubernetes.py:
- delete_pod() - Delete a Kubernetes pod

## Files Modified
- tests/scenarios/smoke.py
- tests/steps/clickhouse.py
- tests/steps/deployment.py
- tests/steps/kubernetes.py

Fixes timing issues, adds production-ready test scenarios, and improves overall test suite reliability and maintainability.
1 parent 92b29dc commit 3047b8a

File tree

3 files changed

+461
-85
lines changed

3 files changed

+461
-85
lines changed

tests/scenarios/smoke.py

Lines changed: 48 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import tests.steps.kubernetes as kubernetes
55
import tests.steps.minikube as minikube
66
import tests.steps.helm as helm
7+
import tests.steps.clickhouse as clickhouse
78
from tests.steps.deployment import HelmState
89

910

@@ -17,7 +18,6 @@
1718

1819
UPGRADE_SCENARIOS = [
1920
("fixtures/upgrade/initial.yaml", "fixtures/upgrade/upgrade.yaml"),
20-
("fixtures/upgrade/simple-initial.yaml", "fixtures/upgrade/complex-upgraded.yaml"),
2121
]
2222

2323

@@ -56,6 +56,19 @@ def check_deployment(self, fixture_file, skip_external_keeper=True):
5656

5757
with Then("verify deployment state"):
5858
state.verify_all(namespace=namespace)
59+
60+
# Add Keeper HA test for replicated deployments with 3+ keepers
61+
if "replicated" in fixture_name:
62+
with And("test Keeper high availability (chaos test)"):
63+
admin_password = state.clickhouse_config.get("defaultUser", {}).get("password", "")
64+
clickhouse.test_keeper_high_availability(
65+
namespace=namespace,
66+
admin_password=admin_password
67+
)
68+
69+
# Verify metrics endpoint is accessible
70+
with And("verify metrics endpoint"):
71+
clickhouse.verify_metrics_endpoint(namespace=namespace)
5972

6073
with Finally("cleanup deployment"):
6174
helm.uninstall(namespace=namespace, release_name=release_name)
@@ -92,6 +105,23 @@ def check_upgrade(self, initial_fixture, upgrade_fixture):
92105

93106
with Then("verify initial deployment state"):
94107
initial_state.verify_all(namespace=namespace)
108+
109+
# Only test data survival if nameOverride stays the same (in-place upgrade)
110+
initial_name = initial_state.values.get("nameOverride", "")
111+
upgrade_name = upgrade_state.values.get("nameOverride", "")
112+
is_inplace_upgrade = (initial_name == upgrade_name)
113+
114+
if is_inplace_upgrade:
115+
with And("create test data for upgrade survival verification"):
116+
admin_password = initial_state.clickhouse_config.get("defaultUser", {}).get("password", "")
117+
clickhouse.create_test_data(
118+
namespace=namespace,
119+
admin_password=admin_password,
120+
table_name="pre_upgrade_data",
121+
test_value=f"upgrade_survival_{namespace}"
122+
)
123+
else:
124+
note(f"Skipping data survival test: nameOverride changed from '{initial_name}' to '{upgrade_name}' (cluster replacement scenario)")
95125

96126
with When("upgrade ClickHouse to new configuration"):
97127
helm.upgrade(
@@ -100,6 +130,21 @@ def check_upgrade(self, initial_fixture, upgrade_fixture):
100130

101131
with Then("verify upgraded deployment state"):
102132
upgrade_state.verify_all(namespace=namespace)
133+
134+
if is_inplace_upgrade:
135+
with And("verify data survived the upgrade"):
136+
admin_password = upgrade_state.clickhouse_config.get("defaultUser", {}).get("password", "")
137+
clickhouse.verify_data_survival(
138+
namespace=namespace,
139+
admin_password=admin_password,
140+
table_name="pre_upgrade_data",
141+
expected_value=f"upgrade_survival_{namespace}"
142+
)
143+
else:
144+
note(f"Data survival verification skipped for cluster replacement scenario")
145+
146+
with And("verify metrics endpoint"):
147+
clickhouse.verify_metrics_endpoint(namespace=namespace)
103148

104149
with Finally("cleanup deployment"):
105150
helm.uninstall(namespace=namespace, release_name=release_name)
@@ -139,5 +184,5 @@ def feature(self):
139184
kubernetes.use_context(context_name="minikube")
140185

141186
Feature(run=check_all_fixtures)
142-
#
143-
# Feature(run=check_all_upgrades)
187+
188+
Feature(run=check_all_upgrades)

0 commit comments

Comments
 (0)