Skip to content

Commit 1f89106

Browse files
committed
From mgirgis: Add Cloudkitty healthcheck.py
1 parent 6081ab1 commit 1f89106

File tree

2 files changed

+98
-38
lines changed

2 files changed

+98
-38
lines changed

pkg/cloudkittyproc/statefulset.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
appsv1 "k8s.io/api/apps/v1"
2525
corev1 "k8s.io/api/core/v1"
2626
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/util/intstr"
2728
)
2829

2930
const (
@@ -44,7 +45,7 @@ func StatefulSet(
4445
// cloudKittyGroup := int64(telemetryv1.CloudKittyGroupID)
4546

4647
// TODO until we determine how to properly query for these
47-
/*livenessProbe := &corev1.Probe{
48+
livenessProbe := &corev1.Probe{
4849
// TODO might need tuning
4950
TimeoutSeconds: 5,
5051
PeriodSeconds: 3,
@@ -56,19 +57,18 @@ func StatefulSet(
5657
FailureThreshold: 12,
5758
PeriodSeconds: 5,
5859
InitialDelaySeconds: 5,
59-
}*/
60+
}
6061

6162
args := []string{"-c", ServiceCommand}
62-
/*var probeCommand []string
63+
var probeCommand []string
6364
livenessProbe.HTTPGet = &corev1.HTTPGetAction{
6465
Port: intstr.FromInt(8080),
6566
}
6667
startupProbe.HTTPGet = livenessProbe.HTTPGet
6768
probeCommand = []string{
68-
"/usr/local/bin/container-scripts/healthcheck.py",
69-
"processor",
70-
"/etc/cloudkitty/cloudkitty.conf.d",
71-
}*/
69+
"/var/lib/openstack/bin/healthcheck.py",
70+
"/etc/cloudkitty/cloudkitty.conf.d/cloudkitty.conf",
71+
}
7272

7373
envVars := map[string]env.Setter{}
7474
envVars["KOLLA_CONFIG_STRATEGY"] = env.SetValue("COPY_ALWAYS")
@@ -112,13 +112,13 @@ func StatefulSet(
112112
SecurityContext: &corev1.SecurityContext{
113113
RunAsUser: &cloudKittyUser,
114114
},
115-
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
116-
VolumeMounts: volumeMounts,
117-
Resources: instance.Spec.Resources,
118-
//LivenessProbe: livenessProbe,
119-
//StartupProbe: startupProbe,
115+
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
116+
VolumeMounts: volumeMounts,
117+
Resources: instance.Spec.Resources,
118+
LivenessProbe: livenessProbe,
119+
StartupProbe: startupProbe,
120120
},
121-
/*{
121+
{
122122
Name: "probe",
123123
Command: probeCommand,
124124
Image: instance.Spec.ContainerImage,
@@ -127,7 +127,7 @@ func StatefulSet(
127127
//RunAsGroup: &cloudKittyGroup,
128128
},
129129
VolumeMounts: volumeMounts,
130-
},*/
130+
},
131131
},
132132
Volumes: volumes,
133133
},
@@ -149,4 +149,4 @@ func StatefulSet(
149149
}
150150

151151
return statefulset
152-
}
152+
}

templates/cloudkitty/bin/healthcheck.py

Lines changed: 83 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,43 +14,74 @@
1414
# License for the specific language governing permissions and limitations
1515
# under the License.
1616

17-
# Trivial HTTP server to check health of scheduler, backup and volume services.
18-
# Cinder-API hast its own health check endpoint and does not need this.
19-
#
20-
# The only check this server currently does is using the heartbeat in the
21-
# database service table, accessing the DB directly here using cinder's
22-
# configuration options.
23-
#
24-
# The benefit of accessing the DB directly is that it doesn't depend on the
25-
# Cinder-API service being up and we can also differentiate between the
26-
# container not having a connection to the DB and the cinder service not doing
27-
# the heartbeats.
28-
#
29-
# For volume services all enabled backends must be up to return 200, so it is
30-
# recommended to use a different pod for each backend to avoid one backend
31-
# affecting others.
32-
#
33-
# Requires the name of the service as the first argument (volume, backup,
34-
# scheduler) and optionally a second argument with the location of the
35-
# configuration directory (defaults to /etc/cinder/cinder.conf.d)
3617

3718
from http import server
3819
import signal
3920
import socket
4021
import sys
4122
import time
4223
import threading
24+
import requests
4325

4426
from oslo_config import cfg
4527

28+
4629
SERVER_PORT = 8080
4730
CONF = cfg.CONF
4831

32+
4933
class HTTPServerV6(server.HTTPServer):
50-
address_family = socket.AF_INET6
34+
address_family = socket.AF_INET6
35+
5136

5237
class HeartbeatServer(server.BaseHTTPRequestHandler):
38+
39+
@staticmethod
def check_services():
    """Probe the external endpoints the service depends on.

    Issues an HTTP GET (5 second timeout) against the Keystone URL read
    from ``CONF.keystone_authtoken.auth_url`` and against the Prometheus
    URL read from ``CONF.collector_prometheus.prometheus_url``.

    Returns:
        dict: ``'keystone_endpoint'`` mapped to ``'OK'`` or ``'WARN'`` and
        ``'collector_endpoint'`` mapped to ``'OK'``. Failures are never
        reported through the dict; they are signalled by raising.

    Raises:
        Exception: if either request fails. ``args[0]`` is a short
            summary and ``args[1]`` is the *text* of the underlying
            ``requests`` error, so both can be passed straight to
            ``send_error`` (which HTML-escapes them and therefore needs
            strings). The original error is chained as ``__cause__``.
    """
    print("Starting health checks")
    results = {}

    # TODO: database endpoint reachability is not checked yet.

    # Keystone endpoint reachability
    keystone_uri = CONF.keystone_authtoken.auth_url
    try:
        response = requests.get(keystone_uri, timeout=5)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Keystone endpoint check failed: {e}")
        # Pass the error text, not the exception object: the handler
        # forwards args[1] to send_error(), which requires a string.
        raise Exception('ERROR: Keystone check failed', str(e)) from e

    # A reachable endpoint that does not identify itself as Keystone is
    # only a warning; it does not fail the health check.
    # NOTE(review): deployments that front Keystone with a generic web
    # server may never report 'keystone' in the Server header -- confirm
    # that this heuristic matches the target environment.
    server_header = response.headers.get('Server', '').lower()
    if 'keystone' in server_header:
        results['keystone_endpoint'] = 'OK'
        print("Keystone endpoint reachable and responsive.")
    else:
        results['keystone_endpoint'] = 'WARN'
        print(f"Keystone endpoint reachable, but not a valid Keystone service: {keystone_uri}")

    # Prometheus collector endpoint reachability
    collector = CONF.collector_prometheus
    prometheus_url = collector.prometheus_url
    if collector.insecure:
        # Certificate verification explicitly disabled by configuration.
        verify_ssl = False
    else:
        # Use the configured CA bundle when set, else the default trust store.
        verify_ssl = collector.cafile or True

    try:
        response = requests.get(prometheus_url, timeout=5, verify=verify_ssl)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Prometheus collector check failed: {e}")
        raise Exception('ERROR: Prometheus collector check failed', str(e)) from e

    results['collector_endpoint'] = 'OK'
    print("Prometheus collector endpoint reachable.")

    return results
77+
5378
def do_GET(self):
79+
try:
80+
self.check_services()
81+
except Exception as exc:
82+
self.send_error(500, exc.args[0], exc.args[1])
83+
return
84+
5485
self.send_response(200)
5586
self.send_header("Content-type", "text/html")
5687
self.end_headers()
@@ -68,12 +99,41 @@ def stopper(signal_number=None, frame=None):
6899

69100

70101
if __name__ == "__main__":
102+
# Register config options
103+
cfg.CONF.register_group(cfg.OptGroup(name='database', title='Database connection options'))
104+
cfg.CONF.register_opt(cfg.StrOpt('connection', default=None), group='database')
105+
106+
cfg.CONF.register_group(cfg.OptGroup(name='keystone_authtoken', title='Keystone Auth Token Options'))
107+
cfg.CONF.register_opt(cfg.StrOpt('auth_url',
108+
default='https://keystone-internal.openstack.svc:5000'),
109+
group='keystone_authtoken')
110+
111+
cfg.CONF.register_group(cfg.OptGroup(name='collector_prometheus', title='Prometheus Collector Options'))
112+
cfg.CONF.register_opt(cfg.StrOpt('prometheus_url',
113+
default='http://metric-storage-prometheus.openstack.svc:9090'),
114+
group='collector_prometheus')
115+
cfg.CONF.register_opt(cfg.BoolOpt('insecure', default=False), group='collector_prometheus')
116+
cfg.CONF.register_opt(cfg.StrOpt('cafile', default=None), group='collector_prometheus')
117+
118+
# Load configuration from file
119+
try:
120+
cfg.CONF(sys.argv[1:], default_config_files=['/etc/cloudkitty/cloudkitty.conf.d/cloudkitty.conf'])
121+
except cfg.ConfigFilesNotFoundError as e:
122+
print(f"Health check failed: {e}", file=sys.stderr)
123+
sys.exit(1)
124+
125+
# Detect IPv6 support for binding
71126
hostname = socket.gethostname()
72-
ipv6_address = socket.getaddrinfo(hostname, None, socket.AF_INET6)
127+
try:
128+
ipv6_address = socket.getaddrinfo(hostname, None, socket.AF_INET6)
129+
except socket.gaierror:
130+
ipv6_address = None
131+
73132
if ipv6_address:
74-
webServer = HTTPServerV6(("::",SERVER_PORT), HeartbeatServer)
133+
webServer = HTTPServerV6(("::", SERVER_PORT), HeartbeatServer)
75134
else:
76135
webServer = server.HTTPServer(("0.0.0.0", SERVER_PORT), HeartbeatServer)
136+
77137
stop = get_stopper(webServer)
78138

79139
# Need to run the server on a different thread because its shutdown method
@@ -91,4 +151,4 @@ def stopper(signal_number=None, frame=None):
91151
except KeyboardInterrupt:
92152
pass
93153
finally:
94-
stop()
154+
stop()

0 commit comments

Comments
 (0)