Skip to content

Commit 67cbd56

Browse files
committed
Merge branch 'main' into trial/coredns-rock
2 parents 2f84b0a + 545699f commit 67cbd56

File tree

15 files changed

+1686
-383
lines changed

15 files changed

+1686
-383
lines changed

.github/ARCHITECTURE.md

Lines changed: 289 additions & 91 deletions
Large diffs are not rendered by default.

.github/workflows/test-baseline-clusters.yml

Lines changed: 1 addition & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -133,30 +133,7 @@ jobs:
133133
- name: Verify Cluster Health
134134
shell: bash
135135
run: |
136-
echo "Verifying cluster health..."
137-
138-
# For no-CNI manifests, skip node ready check (nodes will be NotReady without CNI)
139-
if [[ "${{ matrix.manifest-file }}" == *"no-cni"* ]]; then
140-
echo "⚠️ Skipping node ready check for no-CNI manifest (expected behavior)"
141-
echo "Verifying control plane components are running..."
142-
143-
# Wait for core system pods to be running
144-
kubectl wait --for=condition=Ready pod -l component=kube-apiserver -n kube-system --timeout=5m || true
145-
kubectl wait --for=condition=Ready pod -l component=etcd -n kube-system --timeout=5m || true
146-
kubectl wait --for=condition=Ready pod -l component=kube-controller-manager -n kube-system --timeout=5m || true
147-
kubectl wait --for=condition=Ready pod -l component=kube-scheduler -n kube-system --timeout=5m || true
148-
else
149-
echo "Waiting for nodes to be Ready..."
150-
kubectl wait --for=condition=Ready node --all --timeout=5m
151-
kubectl wait --for=condition=Ready pod --all -n kube-system --timeout=5m
152-
fi
153-
154-
# Get cluster info
155-
kubectl cluster-info
156-
kubectl get nodes -o wide
157-
kubectl get pods -A -o wide
158-
159-
echo "✅ Cluster is healthy"
136+
kube-galaxy status --wait --timeout 300
160137
161138
- name: Run Spread Tests
162139
id: run-tests

src/kube_galaxy/cli.py

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -92,9 +92,21 @@ def setup_cmd(
9292

9393

9494
@app.command(name="status")
95-
def status_cmd() -> None:
96-
"""Display project status and dependency information."""
97-
status.status()
95+
def status_cmd(
96+
wait: bool = typer.Option(
97+
False,
98+
"--wait",
99+
help="Wait for cluster nodes and kube-system pods to become Ready",
100+
),
101+
timeout: int = typer.Option(
102+
300,
103+
"--timeout",
104+
min=1,
105+
help="Readiness wait timeout in seconds (used with --wait)",
106+
),
107+
) -> None:
108+
"""Display project status and optional cluster health verification."""
109+
status.status(wait=wait, timeout=timeout)
98110

99111

100112
def main() -> None:

src/kube_galaxy/cmd/status.py

Lines changed: 82 additions & 79 deletions
Original file line number | Diff line number | Diff line change
@@ -1,111 +1,114 @@
11
"""Status command handler."""
22

33
import shutil
4-
import subprocess
5-
from pathlib import Path
4+
from collections.abc import Callable
65

7-
from kube_galaxy.pkg.utils.logging import info, print_dict, section
6+
import typer
87

8+
from kube_galaxy.pkg.utils.client import (
9+
get_cluster_info,
10+
get_context,
11+
get_nodes,
12+
get_pods,
13+
wait_for_nodes,
14+
wait_for_pods,
15+
)
16+
from kube_galaxy.pkg.utils.errors import ClusterError
17+
from kube_galaxy.pkg.utils.logging import error, info, print_dict, section, success, warning
18+
from kube_galaxy.pkg.utils.shell import run
919

10-
def status() -> None:
11-
"""Display project status including dependencies and file counts."""
20+
21+
def status(wait: bool = False, timeout: int = 300) -> None:
22+
"""Display project status and optionally verify cluster health."""
1223
section("Kubernetes Galaxy Test - Project Status")
1324

14-
# Check dependencies
25+
_print_dependency_status()
26+
_print_cluster_context()
27+
28+
if wait:
29+
_verify_cluster_health(timeout)
30+
success("Cluster is healthy")
31+
32+
33+
def _print_dependency_status() -> None:
34+
"""Print required command dependency status."""
1535
info("Dependencies:")
1636
deps = {
17-
"kubectl": check_command("kubectl"),
18-
"kubeadm": check_command("kubeadm"),
19-
"spread": check_command("spread"),
37+
"kubectl": _check_command("kubectl"),
38+
"spread": _check_command("spread"),
2039
}
2140
print_dict(deps)
2241

23-
# Count project files
42+
43+
def _print_cluster_context() -> None:
44+
"""Print active cluster context and current node table if available."""
45+
if not shutil.which("kubectl"):
46+
warning("kubectl not available; skipping cluster checks")
47+
return
48+
2449
info("")
25-
info("Project Files:")
26-
file_counts = {
27-
"Manifests": len(list(Path("manifests").glob("*.yaml")))
28-
if Path("manifests").exists()
29-
else 0,
30-
"Workflows": len(
31-
list(Path(".github/workflows").glob("*.yml"))
32-
+ list(Path(".github/workflows").glob("*.yaml"))
33-
)
34-
if Path(".github/workflows").exists()
35-
else 0,
36-
"Actions": len(
37-
list(Path(".github/actions").glob("*/action.yml"))
38-
+ list(Path(".github/actions").glob("*/action.yaml"))
39-
)
40-
if Path(".github/actions").exists()
41-
else 0,
42-
"Tests": len(list(Path("tests").glob("*.yaml")) + list(Path("tests").glob("*.yml")))
43-
if Path("tests").exists()
44-
else 0,
45-
}
46-
print_dict(file_counts)
50+
try:
51+
context = get_context()
52+
info(f"Active Cluster: {context}")
53+
nodes_output = get_nodes()
54+
if nodes_output:
55+
lines = nodes_output.strip().split("\n")
56+
info(f"Cluster Nodes: {len(lines) - 1}")
57+
for line in lines[1:]:
58+
if line:
59+
info(f" {line}")
60+
except ClusterError:
61+
info("Active Cluster: error checking")
4762

48-
# Show kubeadm cluster nodes
49-
if shutil.which("kubectl"):
50-
info("")
51-
try:
52-
result = subprocess.run(
53-
["kubectl", "get", "nodes"],
54-
capture_output=True,
55-
text=True,
56-
check=False,
57-
)
58-
if result.returncode == 0 and result.stdout:
59-
lines = result.stdout.strip().split("\n")
60-
info(f"Cluster Nodes: {len(lines) - 1}") # Subtract header
61-
for line in lines[1:]: # Skip header
62-
if line:
63-
info(f" {line}")
64-
except Exception:
65-
pass
6663

67-
# Show active cluster
68-
if shutil.which("kubectl"):
69-
info("")
70-
try:
71-
result = subprocess.run(
72-
["kubectl", "config", "current-context"],
73-
capture_output=True,
74-
text=True,
75-
check=False,
76-
)
77-
if result.returncode == 0:
78-
context = result.stdout.strip()
79-
info(f"Active Cluster: {context}")
80-
else:
81-
info("Active Cluster: none")
82-
except Exception:
83-
info("Active Cluster: error checking")
64+
def _verify_cluster_health(timeout: int) -> None:
65+
"""Wait for cluster readiness and print summary tables."""
66+
if not shutil.which("kubectl"):
67+
error("kubectl is required for --wait health checks", show_traceback=False)
68+
raise typer.Exit(code=1)
69+
70+
section("Cluster Health Verification")
71+
info("Waiting for nodes to be Ready...")
8472

73+
try:
74+
wait_for_nodes(timeout=timeout)
75+
wait_for_pods(namespace="kube-system", timeout=timeout)
76+
except ClusterError as exc:
77+
error(str(exc), show_traceback=False)
78+
error("Cluster readiness checks failed", show_traceback=False)
79+
raise typer.Exit(code=1) from exc
8580

86-
def check_command(cmd: str) -> str:
81+
_print_command_output(get_cluster_info, "Cluster Info")
82+
_print_command_output(get_nodes, "Nodes")
83+
_print_command_output(get_pods, "Pods")
84+
85+
86+
def _print_command_output(command: Callable[[], str], title: str) -> None:
87+
"""Run command and print its output with a section label."""
88+
info("")
89+
info(f"{title}:")
90+
try:
91+
if output := command().strip():
92+
info(output)
93+
except ClusterError as exc:
94+
error(f"Failed to run: {title}", show_traceback=False)
95+
raise typer.Exit(code=1) from exc
96+
97+
98+
def _check_command(cmd: str) -> str:
8799
"""Check if a command is installed and return status."""
88100
if shutil.which(cmd):
89101
try:
90102
if cmd == "kubectl":
91-
result = subprocess.run(
103+
result = run(
92104
[cmd, "version", "--client"],
93105
capture_output=True,
94-
text=True,
95-
check=False,
96-
)
97-
elif cmd == "kubeadm":
98-
result = subprocess.run(
99-
[cmd, "version"],
100-
capture_output=True,
101-
text=True,
102106
check=False,
103107
)
104108
else:
105-
result = subprocess.run(
109+
result = run(
106110
[cmd, "--version"],
107111
capture_output=True,
108-
text=True,
109112
check=False,
110113
)
111114

src/kube_galaxy/cmd/test.py

Lines changed: 5 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
"""Test command handler."""
22

3-
import subprocess
43
from pathlib import Path
54

65
import typer
@@ -9,6 +8,8 @@
98
from kube_galaxy.pkg.manifest.loader import load_manifest
109
from kube_galaxy.pkg.manifest.validator import validate_manifest
1110
from kube_galaxy.pkg.testing.spread import collect_test_results, run_spread_tests
11+
from kube_galaxy.pkg.utils.client import get_context, verify_connectivity
12+
from kube_galaxy.pkg.utils.errors import ClusterError
1213
from kube_galaxy.pkg.utils.logging import error, exception, info, section, success, warning
1314

1415

@@ -22,25 +23,10 @@ def spread(manifest_path: str) -> None:
2223

2324
try:
2425
# Check if kubectl can connect
25-
result = subprocess.run(
26-
["kubectl", "cluster-info"],
27-
capture_output=True,
28-
text=True,
29-
check=False,
30-
)
31-
if result.returncode != 0:
32-
error("No Kubernetes cluster available. Please set up a cluster first.")
33-
info("You can create a test cluster with: kube-galaxy setup")
34-
raise typer.Exit(code=1)
26+
verify_connectivity()
3527

3628
# Get cluster context
37-
result = subprocess.run(
38-
["kubectl", "config", "current-context"],
39-
capture_output=True,
40-
text=True,
41-
check=True,
42-
)
43-
cluster_context = result.stdout.strip()
29+
cluster_context = get_context()
4430
success(f"Connected to cluster: {cluster_context}")
4531

4632
# Run spread tests from manifest
@@ -57,7 +43,7 @@ def spread(manifest_path: str) -> None:
5743

5844
success("Spread tests completed")
5945

60-
except Exception as e:
46+
except ClusterError as e:
6147
exception("Spread tests failed", e)
6248
raise typer.Exit(code=1) from e
6349

src/kube_galaxy/pkg/cluster.py

Lines changed: 29 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
"""Cluster setup and provisioning with 8-stage component lifecycle."""
22

3+
from concurrent.futures import ThreadPoolExecutor
34
from pathlib import Path
45

56
from kube_galaxy.pkg.arch.detector import ArchInfo, get_arch_info
@@ -81,7 +82,7 @@ def setup_cluster(manifest_path: str, work_dir: str = ".") -> None:
8182
num_hooks = len(SetupHooks)
8283
for idx, hook in enumerate(SetupHooks, 1):
8384
section(f"Stage {idx}/{num_hooks}: {hook.value.capitalize()} Components")
84-
_run_hook(instances_list, configs, hook.value)
85+
_run_hook(instances_list, configs, hook.value, parallel=hook.is_parallel)
8586

8687
section("Cluster Setup Complete!")
8788
success("Kubeconfig: $HOME/.kube/config")
@@ -128,7 +129,7 @@ def teardown_cluster(manifest_path: str, force: bool = False) -> None:
128129
num_hooks = len(TeardownHooks)
129130
for idx, hook in enumerate(TeardownHooks):
130131
section(f"Stage {idx + 1}/{num_hooks}: {hook.value.capitalize()} Components")
131-
_run_hook(instances_list, configs, hook.value, force)
132+
_run_hook(instances_list, configs, hook.value, force, parallel=hook.is_parallel)
132133

133134
# Final cleanup: remove any remaining kube-galaxy alternatives
134135
_cleanup_kube_galaxy_alternatives(force)
@@ -151,6 +152,7 @@ def _run_hook(
151152
configs: list[ComponentConfig],
152153
hook_name: str,
153154
force: bool = False,
155+
parallel: bool = False,
154156
) -> None:
155157
"""
156158
Run a specific lifecycle hook for all components.
@@ -160,24 +162,36 @@ def _run_hook(
160162
configs: List of component configs (must be in same order as instances)
161163
hook_name: Name of the hook to run (e.g., "install")
162164
force: Continue execution even if errors occur
165+
parallel: Execute hooks concurrently (respects component order for submission)
163166
164167
Raises:
165168
ClusterError: If any component hook fails
166169
"""
167170
hook_name_caps = hook_name.title()
168-
for config, instance in zip(configs, instances, strict=True):
169-
info(f" {config.name}: {hook_name_caps}...")
170-
hook_method = getattr(instance, f"{hook_name}_hook", None)
171-
if not hook_method:
172-
raise ClusterError(f"{hook_name_caps} hook not implemented for {config.name}")
173-
try:
174-
hook_method()
175-
except Exception as exc:
176-
forced = " (continuing due to --force)" if force else ""
177-
message = f"{hook_name_caps} failed for {config.name}{forced}: {exc}"
178-
exception(f" ✗ {message}", exc)
179-
if not force:
180-
raise ClusterError(message) from exc
171+
max_workers = 10 if parallel else 1
172+
173+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
174+
futures_list = []
175+
176+
# Submit all tasks in component order
177+
for config, instance in zip(configs, instances, strict=True):
178+
hook_method = getattr(instance, f"{hook_name}_hook", None)
179+
if not hook_method:
180+
raise ClusterError(f"{hook_name_caps} hook not implemented for {config.name}")
181+
info(f" {config.name}: {hook_name_caps}...")
182+
future = executor.submit(hook_method)
183+
futures_list.append((config.name, future))
184+
185+
# Collect results in submission order
186+
for component_name, future in futures_list:
187+
try:
188+
future.result()
189+
except Exception as exc:
190+
forced = " (continuing due to --force)" if force else ""
191+
message = f"{hook_name_caps} failed for {component_name}{forced}: {exc}"
192+
exception(f" ✗ {message}", exc)
193+
if not force:
194+
raise ClusterError(message) from exc
181195

182196

183197
def _cleanup_kube_galaxy_alternatives(force: bool) -> None:

0 commit comments

Comments
 (0)