Skip to content

Commit e5865a8

Browse files
authored
Adding connection checks and unit tests (#14)
1 parent 47521cd commit e5865a8

File tree

4 files changed

+358
-86
lines changed

4 files changed

+358
-86
lines changed

docs/Agents.md

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,61 @@ GITHUB_ACTIONS=true # Simulate GitHub Actions
252252
- Shared utilities for log streaming and file operations
253253
- Centralized error state detection logic
254254

255+
---
256+
257+
## Session: 2026-02-24 — Connectivity Checks Overhaul
258+
259+
### KUBECONFIG env var support
260+
- `LoadK8sConfig()` now uses `clientcmd.NewDefaultClientConfigLoadingRules()` instead of hardcoding `~/.kube/config`
261+
- Respects `KUBECONFIG` env var and colon-separated paths (same resolution order as `kubectl`)
262+
- File: `pkg/utils/kubeconfig.go`
263+
264+
### Connectivity script extracted to `connectivity_check.py`
265+
- Python script moved out of the Go string literal into `pkg/debug/connectivity_check.py`
266+
- Embedded at build time via `//go:embed connectivity_check.py`
267+
- Namespace substituted by Go: `strings.ReplaceAll(script, "{ns}", namespace)`
268+
- `{ns}` in plain strings is replaced directly by Go; in f-strings, use the `ns = '{ns}'` Python variable trick instead
269+
- File: `pkg/debug/connections.go`, `pkg/debug/connectivity_check.py`
270+
271+
### Refactored connectivity checks with shared data structure
272+
Eliminated per-service copy-paste. Two helper functions cover all patterns:
273+
- `parse_url_service(name, env_var, default_port)` — single URL env var (Postgres, ClickHouse)
274+
- `parse_host_port_service(name, host_env, port_env, note)` — split HOST+PORT env vars (Redis, RabbitMQ)
275+
276+
Adding a new service requires only one new line in the `services` list.
277+
278+
### Services now tested from inside the API pod
279+
280+
| Service | Method | Source |
281+
|---|---|---|
282+
| Authz | TCP `:3592` | hardcoded service name |
283+
| Runners API | TCP `:8090` | hardcoded service name |
284+
| UI | TCP `:3000` | hardcoded service name |
285+
| Minio | HTTP GET `/minio/health/ready` | `minio-client.{ns}.svc.cluster.local:9000` |
286+
| ClickHouse replica-N | HTTP GET `/ping` | `chi-clickhouse-cluster-0-N.{ns}.svc.cluster.local:8123` |
287+
| PostgreSQL | TCP | `GALILEO_DATABASE_URL_READ` |
288+
| ClickHouse | TCP | `GALILEO_CLICKHOUSE_URL_READ_WRITE` |
289+
| Redis | TCP | `GALILEO_REDIS_HOST` + `GALILEO_REDIS_PORT` |
290+
| RabbitMQ | TCP | `GALILEO_RABBITMQ_HOST` + `GALILEO_RABBITMQ_PORT` |
291+
292+
ClickHouse replica checks are generated dynamically from `GALILEO_CLICKHOUSE_EXPECTED_REPLICAS`, mirroring what the `wait-deps` init container checks.
293+
294+
### Infrastructure notes discovered
295+
- **Redis**: Azure Managed Redis (external) — env vars: `GALILEO_REDIS_HOST`, `GALILEO_REDIS_PORT` (10000), `GALILEO_REDIS_PROTOCOL` (`rediss+poll`), `GALILEO_REDIS_PASSWORD`, `GALILEO_REDIS_REQUIRE`
296+
- **RabbitMQ**: env vars `GALILEO_RABBITMQ_HOST`, `GALILEO_RABBITMQ_PORT` (5671 TLS / 5672 plain), `GALILEO_RABBITMQ_TLS_ENABLED`
297+
- **Minio**: PVC-based on Azure — check via HTTP health endpoint, not pod readiness
298+
- **ClickHouse Keeper**: API pod does not talk to Keeper directly. Keeper is checked via Go-side `checkDependencyHealth` only
299+
- **runner-large, runners-metrics-controller**: Worker pods with no Kubernetes Service — cannot be TCP-checked
300+
301+
### `wait-deps` init container
302+
Waits for (in order):
303+
1. `minio-client.{ns}:9000/minio/health/ready` → HTTP 200
304+
2. `chi-clickhouse-cluster-0-{i}.{ns}:8123/ping` → HTTP 200 (for each replica)
305+
306+
If a pod is stuck in `PodInitializing`, either Minio or ClickHouse (which depends on Keeper) is not ready.
307+
308+
---
309+
255310
## Quick Reference
256311

257312
### Common Commands

pkg/debug/connections.go

Lines changed: 14 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package debug
22

33
import (
4+
_ "embed"
5+
46
"bufio"
57
"context"
68
"errors"
@@ -18,6 +20,9 @@ import (
1820
"galileoctl/pkg/utils"
1921
)
2022

23+
//go:embed connectivity_check.py
24+
var connectivityCheckScript string
25+
2126
// ApiConnections runs the connectivity checks from an api pod inside the given namespace.
2227
func ApiConnections(namespace string, tailLines int) error {
2328
if namespace == "" {
@@ -49,64 +54,8 @@ func ApiConnections(namespace string, tailLines int) error {
4954

5055
fmt.Printf("✅ Using pod: %s\n", pod)
5156

52-
// Run the same Python connectivity checks inside the pod
53-
py := `import socket
54-
import os
55-
from urllib.parse import urlparse
56-
57-
services = [
58-
'rabbitmq-cluster.%s.svc.cluster.local',
59-
'authz.%s.svc.cluster.local'
60-
]
61-
62-
print('Starting connectivity tests...')
63-
print('')
64-
65-
for svc in services:
66-
print(f'Testing connectivity to {svc}...')
67-
try:
68-
socket.setdefaulttimeout(5)
69-
result = socket.getaddrinfo(svc, None)
70-
if result:
71-
print(f'✅ SUCCESS: {svc} is reachable (DNS resolved)')
72-
else:
73-
print(f'❌ FAILED: {svc} DNS resolution failed')
74-
except Exception as e:
75-
print(f'❌ FAILED: {svc} is not reachable ({str(e)})')
76-
print('')
77-
78-
env_vars = [
79-
'GALILEO_DATABASE_URL_READ',
80-
'GALILEO_CLICKHOUSE_URL_READ_WRITE'
81-
]
82-
83-
for env_var in env_vars:
84-
url = os.getenv(env_var)
85-
if url:
86-
print(f'Testing connectivity to {env_var}...')
87-
try:
88-
parsed = urlparse(url)
89-
host = parsed.hostname
90-
port = parsed.port or (5432 if 'postgresql' in url else 9000)
91-
print(f' Connecting to {host}:{port}...')
92-
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
93-
sock.settimeout(5)
94-
result = sock.connect_ex((host, port))
95-
sock.close()
96-
if result == 0:
97-
print(f'✅ SUCCESS: {env_var} ({host}:{port}) is reachable')
98-
else:
99-
print(f'❌ FAILED: {env_var} ({host}:{port}) connection refused')
100-
except Exception as e:
101-
print(f'❌ FAILED: {env_var} connectivity test failed ({str(e)})')
102-
else:
103-
print(f'⚠️ WARNING: {env_var} environment variable not found')
104-
print('')
105-
106-
print('Connectivity tests completed.')
107-
`
108-
109-
py = fmt.Sprintf(py, namespace, namespace)
57+
// Run connectivity checks inside the pod (script embedded from connectivity_check.py)
58+
py := strings.ReplaceAll(connectivityCheckScript, "{ns}", namespace)
11059

11160
// Load Kubernetes config
11261
cfg, client, err := utils.LoadK8sConfig()
@@ -328,6 +277,13 @@ func diagnoseLogs(logLines []string) string {
328277
issues = append(issues, "❌ ClickHouse connection failed (check if ClickHouse pods are running)")
329278
}
330279

280+
// Redis connection issues
281+
if strings.Contains(allLogs, "redis") && (strings.Contains(allLogs, "connection refused") ||
282+
strings.Contains(allLogs, "failed to connect") ||
283+
strings.Contains(allLogs, "10000")) {
284+
issues = append(issues, "❌ Redis connection failed (check Azure Managed Redis reachability)")
285+
}
286+
331287
// RabbitMQ connection issues
332288
if strings.Contains(allLogs, "rabbitmq") && (strings.Contains(allLogs, "connection refused") ||
333289
strings.Contains(allLogs, "failed to connect") ||

0 commit comments

Comments
 (0)