Skip to content

Commit e5b28de

Browse files
committed
Improve Ceilometer probes
- increases initial delay for SG probes to avoid restarts during deploy - adds liveness probes to polling agents
1 parent b4285cb commit e5b28de

File tree

4 files changed

+285
-30
lines changed

4 files changed

+285
-30
lines changed

controllers/ceilometer_controller.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1206,12 +1206,16 @@ func (r *CeilometerReconciler) generateServiceConfig(
12061206
cms := []util.Template{
12071207
// ScriptsSecrets
12081208
{
1209-
Name: fmt.Sprintf("%s-scripts", ceilometer.ServiceName),
1210-
Namespace: instance.Namespace,
1211-
Type: util.TemplateTypeScripts,
1212-
InstanceType: "ceilometercentral",
1213-
AdditionalTemplate: map[string]string{"common.sh": "/common/common.sh"},
1214-
Labels: cmLabels,
1209+
Name: fmt.Sprintf("%s-scripts", ceilometer.ServiceName),
1210+
Namespace: instance.Namespace,
1211+
Type: util.TemplateTypeScripts,
1212+
InstanceType: "ceilometercentral",
1213+
AdditionalTemplate: map[string]string{
1214+
"common.sh": "/common/common.sh",
1215+
"centralhealth.py": "/ceilometercentral/bin/centralhealth.py",
1216+
"notificationhealth.py": "/ceilometercentral/bin/notificationhealth.py",
1217+
},
1218+
Labels: cmLabels,
12151219
},
12161220
// Secrets
12171221
{

pkg/ceilometer/statefulset.go

Lines changed: 47 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@ import (
3434

3535
const (
3636
// ServiceCommand -
37-
ServiceCommand = "/usr/local/bin/kolla_set_configs && /usr/local/bin/kolla_start"
37+
ServiceCommand = "/usr/local/bin/kolla_set_configs && /usr/local/bin/kolla_start"
38+
CentralHCScript = "/var/lib/openstack/bin/centralhealth.py"
39+
NotificationHCScript = "/var/lib/openstack/bin/notificationhealth.py"
3840
)
3941

4042
// StatefulSet func
@@ -45,32 +47,51 @@ func StatefulSet(
4547
) (*appsv1.StatefulSet, error) {
4648
runAsUser := int64(0)
4749

48-
// TO-DO Probes
49-
livenessProbe := &corev1.Probe{
50-
// TODO might need tuning
50+
// container probes
51+
sgRootEndpointCurl := corev1.HTTPGetAction{
52+
Path: "/",
53+
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(CeilometerPrometheusPort)},
54+
}
55+
sgLivenessProbe := &corev1.Probe{
5156
TimeoutSeconds: 30,
5257
PeriodSeconds: 30,
53-
InitialDelaySeconds: 5,
58+
InitialDelaySeconds: 300,
5459
}
55-
readinessProbe := &corev1.Probe{
56-
// TODO might need tuning
60+
sgLivenessProbe.HTTPGet = &sgRootEndpointCurl
61+
62+
sgReadinessProbe := &corev1.Probe{
5763
TimeoutSeconds: 30,
5864
PeriodSeconds: 30,
59-
InitialDelaySeconds: 5,
65+
InitialDelaySeconds: 10,
66+
}
67+
sgReadinessProbe.HTTPGet = &sgRootEndpointCurl
68+
69+
//NOTE(mmagr): Once we will be sure (OSP19 timeframe) that we have Ceilometer
70+
// running with heartbeat feature, we can make below probes run much
71+
// less often (poll interval is 5 minutes currently). Right now we need
72+
// to execute HC as often as possible to hit times when pollers connect
73+
// to OpenStack API nodes
74+
centralLivenessProbe := &corev1.Probe{
75+
TimeoutSeconds: 5,
76+
PeriodSeconds: 5,
77+
InitialDelaySeconds: 300,
78+
}
79+
centralLivenessProbe.Exec = &corev1.ExecAction{
80+
Command: []string{"/usr/bin/python3", CentralHCScript},
6081
}
6182

62-
args := []string{"-c"}
63-
args = append(args, ServiceCommand)
64-
65-
livenessProbe.HTTPGet = &corev1.HTTPGetAction{
66-
Path: "/",
67-
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(CeilometerPrometheusPort)},
83+
notificationLivenessProbe := &corev1.Probe{
84+
TimeoutSeconds: 5,
85+
PeriodSeconds: 30,
86+
InitialDelaySeconds: 300,
6887
}
69-
readinessProbe.HTTPGet = &corev1.HTTPGetAction{
70-
Path: "/",
71-
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(CeilometerPrometheusPort)},
88+
notificationLivenessProbe.Exec = &corev1.ExecAction{
89+
Command: []string{"/usr/bin/python3", NotificationHCScript},
7290
}
7391

92+
args := []string{"-c"}
93+
args = append(args, ServiceCommand)
94+
7495
envVarsCentral := map[string]env.Setter{}
7596
envVarsCentral["KOLLA_CONFIG_STRATEGY"] = env.SetValue("COPY_ALWAYS")
7697
envVarsCentral["CONFIG_HASH"] = env.SetValue(configHash)
@@ -95,8 +116,8 @@ func StatefulSet(
95116
svc.CertMount = ptr.To(fmt.Sprintf("/etc/pki/tls/certs/%s", tls.CertKey))
96117
svc.KeyMount = ptr.To(fmt.Sprintf("/etc/pki/tls/private/%s", tls.PrivateKey))
97118

98-
livenessProbe.HTTPGet.Scheme = corev1.URISchemeHTTPS
99-
readinessProbe.HTTPGet.Scheme = corev1.URISchemeHTTPS
119+
sgLivenessProbe.HTTPGet.Scheme = corev1.URISchemeHTTPS
120+
sgReadinessProbe.HTTPGet.Scheme = corev1.URISchemeHTTPS
100121

101122
volumes = append(volumes, svc.CreateVolume(ServiceName))
102123
httpdVolumeMounts = append(httpdVolumeMounts, svc.CreateVolumeMounts(ServiceName)...)
@@ -123,7 +144,8 @@ func StatefulSet(
123144
SecurityContext: &corev1.SecurityContext{
124145
RunAsUser: &runAsUser,
125146
},
126-
VolumeMounts: centralVolumeMounts,
147+
VolumeMounts: centralVolumeMounts,
148+
LivenessProbe: centralLivenessProbe,
127149
}
128150
notificationAgentContainer := corev1.Container{
129151
ImagePullPolicy: corev1.PullAlways,
@@ -137,7 +159,8 @@ func StatefulSet(
137159
SecurityContext: &corev1.SecurityContext{
138160
RunAsUser: &runAsUser,
139161
},
140-
VolumeMounts: notificationVolumeMounts,
162+
VolumeMounts: notificationVolumeMounts,
163+
LivenessProbe: notificationLivenessProbe,
141164
}
142165
sgCoreContainer := corev1.Container{
143166
ImagePullPolicy: corev1.PullAlways,
@@ -156,12 +179,12 @@ func StatefulSet(
156179
RunAsUser: &runAsUser,
157180
},
158181
Ports: []corev1.ContainerPort{{
159-
ContainerPort: 3000,
182+
ContainerPort: int32(CeilometerPrometheusPort),
160183
Name: "proxy-httpd",
161184
}},
162185
VolumeMounts: httpdVolumeMounts,
163-
ReadinessProbe: readinessProbe,
164-
LivenessProbe: livenessProbe,
186+
ReadinessProbe: sgReadinessProbe,
187+
LivenessProbe: sgLivenessProbe,
165188
Command: []string{"/usr/sbin/httpd"},
166189
Args: []string{"-DFOREGROUND"},
167190
}
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
# Copyright 2025 Inc.
4+
# All Rights Reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
7+
# not use this file except in compliance with the License. You may obtain
8+
# a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15+
# License for the specific language governing permissions and limitations
16+
# under the License.
17+
18+
import datetime
19+
import json
20+
import os
21+
import psutil
22+
import socket
23+
import sys
24+
25+
26+
HEARTBEAT_SOCKET = "/var/lib/ceilometer/ceilometer-central.socket"
27+
POLL_SPREAD = datetime.timedelta(minutes=15)
28+
29+
CONN_SPREAD = datetime.timedelta(hours=1)
30+
PROCESS_NAME = "ceilometer-polling"
31+
PROCESS_CONN_CACHE = "/var/lib/ceilometer/tmp/connections"
32+
# Keystosne 5000
33+
# Cinder 8776
34+
# Glance 9292
35+
# Neutron 9696
36+
PROCESS_PORTS = [5000, 8776, 9292, 9696]
37+
38+
39+
def check_pollsters(socket_path: str) -> tuple[int, str]:
40+
"""
41+
Returns 0 if socket_path content contains all records
42+
of polling no older than POLL_SPREAD, otherwise returns 1.
43+
"""
44+
s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
45+
s.connect(socket_path)
46+
47+
report = ""
48+
try:
49+
while True:
50+
data = s.recv(2048)
51+
if data:
52+
report += data.decode('utf-8')
53+
else:
54+
break
55+
finally:
56+
s.close()
57+
58+
limit = datetime.datetime.today() - POLL_SPREAD
59+
for line in report.split('\n'):
60+
if not line:
61+
# Skip empty line, which might occur for cases
62+
# when no pollster had polled yet. In such case
63+
# we assume agent healthy as at least the socket
64+
# has been successfully created.
65+
continue
66+
pollster, timestr = line.split()
67+
timestamp = datetime.datetime.fromisoformat(timestr)
68+
69+
if timestamp < limit:
70+
return 1, f"{pollster}'s timestamp is out of limit"
71+
return 0, ""
72+
73+
74+
def check_connection(cache_path: str,
75+
process_name: str,
76+
ports: list[int]) -> tuple[int, str]:
77+
"""
78+
Returns 0 if given process is or was recently connected
79+
to the given port(s). Otherwise returns 1 or 2 with respective
80+
reason of failure.
81+
"""
82+
# NOT(mmagr): process' connections can be empty if we hit the case
83+
# when no pollster was polling from some API during running
84+
# this script. The previous values will be checked from cache.
85+
# Hence to avoid false negatives we need much higher spread
86+
# than in case of heart beat check
87+
88+
# load connection cache
89+
if os.path.isfile(cache_path):
90+
with open(cache_path, "r") as cch:
91+
conns = json.load(cch)
92+
else:
93+
conns = dict()
94+
95+
# update connection cache values
96+
processes = [proc for proc in psutil.process_iter()
97+
if process_name in proc.name()]
98+
if not processes:
99+
return 1, f"Given process {process_name} was not found"
100+
ports = set(ports)
101+
for p in processes:
102+
conn_method = getattr(p, "net_connections", p.connections)
103+
for c in conn_method():
104+
if c.raddr.port not in ports:
105+
continue
106+
key = f"{c.raddr.ip}/{c.raddr.port}"
107+
conns[key] = dict(ts=datetime.datetime.now().timestamp(),
108+
ip=c.raddr.ip,
109+
port=c.raddr.port)
110+
with open(cache_path, "w") as cch:
111+
json.dump(conns, cch)
112+
113+
# check connection timestamps in the cache
114+
limit = datetime.datetime.today() - CONN_SPREAD
115+
for conn in conns.values():
116+
timestamp = datetime.datetime.fromtimestamp(conn["ts"])
117+
if timestamp < limit:
118+
msg = (f"Timestamp of connection to {conn['ip']}"
119+
f" on port {conn['port']} is out of limit")
120+
return 1, msg
121+
122+
return 0, ""
123+
124+
125+
if __name__ == "__main__":
126+
try:
127+
if os.path.isfile(HEARTBEAT_SOCKET):
128+
# verify pollsters' heart beats
129+
rc, reason = check_pollsters(HEARTBEAT_SOCKET)
130+
else:
131+
# mimic heart beats check on process' connections
132+
# to various OpenStack APIs when the heart beat feature
133+
# is not available
134+
rc, reason = check_connection(PROCESS_CONN_CACHE,
135+
PROCESS_NAME,
136+
PROCESS_PORTS)
137+
except Exception as ex:
138+
rc, reason = 2, f"Unkown error: {ex}"
139+
140+
if rc != 0:
141+
print(reason)
142+
sys.exit(rc)
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
# Copyright 2025 Inc.
4+
# All Rights Reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
7+
# not use this file except in compliance with the License. You may obtain
8+
# a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15+
# License for the specific language governing permissions and limitations
16+
# under the License.
17+
18+
import datetime
19+
import json
20+
import os
21+
import psutil
22+
import sys
23+
24+
25+
CONN_SPREAD = datetime.timedelta(minutes=5)
26+
PROCESS_NAME = "ceilometer-agent-notification"
27+
PROCESS_CONN_CACHE = "/var/lib/ceilometer/tmp/connections"
28+
PROCESS_PORTS = [5672]
29+
30+
31+
def check_connection(cache_path: str,
32+
process_name: str,
33+
ports: list[int]) -> tuple[int, str]:
34+
"""
35+
Returns 0 if given process is or was recently connected
36+
to the given port(s). Otherwise returns 1 or 2 with respective
37+
reason of failure.
38+
"""
39+
# load connection cache
40+
if os.path.isfile(cache_path):
41+
with open(cache_path, "r") as cch:
42+
conns = json.load(cch)
43+
else:
44+
conns = dict()
45+
46+
# update connection cache values
47+
processes = [proc for proc in psutil.process_iter()
48+
if process_name in proc.name()]
49+
if not processes:
50+
return 1, f"Given process {process_name} was not found"
51+
ports = set(ports)
52+
for p in processes:
53+
conn_method = getattr(p, "net_connections", p.connections)
54+
for c in conn_method():
55+
if c.raddr.port not in ports:
56+
continue
57+
key = f"{c.raddr.ip}/{c.raddr.port}"
58+
conns[key] = dict(ts=datetime.datetime.now().timestamp(),
59+
ip=c.raddr.ip,
60+
port=c.raddr.port)
61+
with open(cache_path, "w") as cch:
62+
json.dump(conns, cch)
63+
64+
# check connection timestamps in the cache
65+
limit = datetime.datetime.today() - CONN_SPREAD
66+
for conn in conns.values():
67+
timestamp = datetime.datetime.fromtimestamp(conn["ts"])
68+
if timestamp < limit:
69+
msg = (f"Timestamp of connection to {conn['ip']}"
70+
f" on port {conn['port']} is out of limit")
71+
return 1, msg
72+
73+
return 0, ""
74+
75+
76+
if __name__ == "__main__":
77+
try:
78+
rc, reason = check_connection(PROCESS_CONN_CACHE,
79+
PROCESS_NAME,
80+
PROCESS_PORTS)
81+
except Exception as ex:
82+
rc, reason = 2, f"Unkown error: {ex}"
83+
84+
if rc != 0:
85+
print(reason)
86+
sys.exit(rc)

0 commit comments

Comments
 (0)