Skip to content

Commit 99aa9e0

Browse files
authored
Merge pull request ceph#60167 from jmolmo/add_daemon_health_metric
exporter: New metric for report ceph daemons health
2 parents 1e4f788 + 3c9b07e commit 99aa9e0

File tree

4 files changed

+141
-4
lines changed

4 files changed

+141
-4
lines changed

doc/monitoring/index.rst

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,30 @@ in:
6464

6565
Note that the main tool for observing and monitoring a Ceph cluster is the **Ceph dashboard**. It provides graphs of the most important cluster and service metrics. Most of the examples in this document are taken from the dashboard graphs or derived from the metrics exposed by the Ceph dashboard.
6666

67+
Ceph daemon health metrics
68+
==========================
69+
70+
The Ceph exporter provides a metric called ``ceph_daemon_socket_up`` that reports the liveness status of each Ceph daemon that exposes an admin socket.
71+
72+
The ``ceph_daemon_socket_up`` metric indicates the health status of a Ceph daemon based on its ability to respond via the admin socket: a value of ``1`` means healthy, and ``0`` means unhealthy. A Ceph daemon might still be "alive" while the exporter reports ``ceph_daemon_socket_up=0`` for it, but an unresponsive admin socket indicates a significant problem with the daemon. This makes the metric a useful tool for detecting problems in any of the main Ceph daemons.
73+
74+
Labels:
75+
- ``ceph_daemon``: Identifier of the Ceph daemon exposing an admin socket on the host.
76+
- ``hostname``: Name of the host where the Ceph daemon is running.
77+
78+
Example:
79+
80+
.. code-block:: bash
81+
82+
ceph_daemon_socket_up{ceph_daemon="mds.a",hostname="testhost"} 1
83+
ceph_daemon_socket_up{ceph_daemon="osd.1",hostname="testhost"} 0
84+
85+
To identify any Ceph daemons that were not responsive at any point in the last 12 hours, you can use the following PromQL expression:
86+
87+
.. code-block:: bash
88+
89+
ceph_daemon_socket_up == 0 or min_over_time(ceph_daemon_socket_up[12h]) == 0
90+
6791
6892
Performance metrics
6993
===================

src/exporter/DaemonMetricCollector.cc

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,10 +168,17 @@ void DaemonMetricCollector::dump_asok_metrics(bool sort_metrics, int64_t counter
168168
if (sockClientsPing) {
169169
bool ok;
170170
sock_client.ping(&ok);
171+
std::string ceph_daemon_socket_up_desc(
172+
"Reports the health status of a Ceph daemon, as determined by whether it is able to respond via its admin socket (1 = healthy, 0 = unhealthy).");
173+
labels_t ceph_daemon_socket_up_labels;
174+
ceph_daemon_socket_up_labels["hostname"] = quote(ceph_get_hostname());
175+
ceph_daemon_socket_up_labels["ceph_daemon"] = quote(daemon_name);
176+
add_metric(builder, static_cast<int>(ok), "ceph_daemon_socket_up", ceph_daemon_socket_up_desc,
177+
"gauge", ceph_daemon_socket_up_labels);
171178
if (!ok) {
172179
failures++;
173180
continue;
174-
}
181+
}
175182
}
176183
std::string counter_dump_response = dump_response.size() > 0 ? dump_response :
177184
asok_request(sock_client, "counter dump", daemon_name);

src/exporter/DaemonMetricCollector.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@ class DaemonMetricCollector {
4242
std::map<std::string, AdminSocketClient> clients;
4343
std::string metrics;
4444
std::pair<labels_t, std::string> add_fixed_name_metrics(std::string metric_name);
45+
void update_sockets();
4546

4647
private:
4748
std::mutex metrics_mutex;
4849
std::unique_ptr<MetricsBuilder> builder;
49-
void update_sockets();
5050
void request_loop(boost::asio::steady_timer &timer);
5151

5252
void dump_asok_metric(boost::json::object perf_info,

src/test/exporter/test_exporter.cc

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
#include "common/ceph_argparse.h"
22
#include "common/config.h"
33
#include "common/config_proxy.h"
4+
#include "common/admin_socket.h"
5+
#include "common/admin_socket_client.h"
46
#include <gmock/gmock.h>
57
#include "gtest/gtest.h"
68
#include "common/ceph_context.h"
79
#include "global/global_context.h"
810
#include "global/global_init.h"
911
#include "exporter/util.h"
1012
#include "exporter/DaemonMetricCollector.h"
13+
#include <filesystem>
1114

1215
#include <regex>
1316
#include <string>
@@ -674,6 +677,27 @@ static std::vector<std::pair<std::string, std::string>> promethize_data = {
674677
{"rocksdb.submit_sync_latency_sum", "ceph_rocksdb_submit_sync_latency_sum"}
675678
};
676679

680+
681+
class AdminSocketTest
682+
{
683+
public:
684+
explicit AdminSocketTest(AdminSocket *asokc)
685+
: m_asokc(asokc)
686+
{
687+
}
688+
bool init(const std::string &uri) {
689+
return m_asokc->init(uri);
690+
}
691+
std::string bind_and_listen(const std::string &sock_path, int *fd) {
692+
return m_asokc->bind_and_listen(sock_path, fd);
693+
}
694+
bool shutdown() {
695+
m_asokc->shutdown();
696+
return true;
697+
}
698+
AdminSocket *m_asokc;
699+
};
700+
677701
int main(int argc, char **argv)
678702
{
679703
::testing::InitGoogleTest(&argc, argv);
@@ -1289,8 +1313,11 @@ ceph_mon_session_rm{ceph_daemon="mon.a"} 577
12891313
# TYPE ceph_mon_session_trim counter
12901314
ceph_mon_session_trim{ceph_daemon="mon.a"} 9
12911315
)";
1292-
1293-
ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos);
1316+
1317+
std::string actualMetrics = collector.metrics;
1318+
std::cout << "Actual MON Metrics: " << actualMetrics << std::endl;
1319+
ASSERT_TRUE(actualMetrics.find(expectedMetrics) != std::string::npos);
1320+
//ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos);
12941321

12951322
// Test for labeled metrics - RGW
12961323
daemon = "ceph-client.rgw.foo.ceph-node-00.aayrrj.2.93993527376064";
@@ -1452,3 +1479,82 @@ TEST(Exporter, add_fixed_name_metrics) {
14521479
EXPECT_EQ(new_metric.first, expected_labels);
14531480
ASSERT_TRUE(new_metric.second == expected_metric_name);
14541481
}
1482+
1483+
// Checks which admin socket files DaemonMetricCollector::update_sockets()
// turns into clients. A directory is filled with empty ".asok" files named
// like vstart and real-cluster daemons, "exporter_sock_dir" is pointed at
// it, and the resulting `clients` map is inspected. The test expects four
// clients (osd, mds, mon, rgw); the mgr and ceph-exporter sockets are
// expected to be left out.
TEST(Exporter, UpdateSockets) {
  const std::string mock_dir = "/tmp/fake_sock_dir";

  // Start from an empty directory so that files left behind by an earlier,
  // interrupted run cannot influence which sockets are discovered.
  std::filesystem::remove_all(mock_dir);
  std::filesystem::create_directories(mock_dir);

  // Create a mix of vstart and real cluster mock .asok files
  const char *const asok_files[] = {
    "ceph-osd.0.asok",
    "ceph-mds.a.asok",
    "ceph-mgr.chatest-node-00.ijzynn.asok",
    "ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952.asok",
    "ceph-client.ceph-exporter.chatest-node-00.asok",
    "ceph-mon.chatest-node-00.asok",
  };
  for (const char *file_name : asok_files) {
    std::ofstream(mock_dir + "/" + file_name).close();
  }

  g_conf().set_val("exporter_sock_dir", mock_dir);

  DaemonMetricCollector collector;

  // Run the function that interacts with the mock directory
  collector.update_sockets();

  // Verify the expected results. Non-fatal EXPECT_* checks are used on
  // purpose: a fatal ASSERT_* would return from the test body immediately
  // and skip the directory clean-up at the end.
  const char *const expected_daemons[] = {
    "ceph-osd.0",
    "ceph-mds.a",
    "ceph-mon.chatest-node-00",
    "ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952",
  };
  // Unsigned literal: clients.size() is unsigned, avoids a sign-compare
  // warning inside the comparison macro.
  EXPECT_EQ(collector.clients.size(), 4u);
  for (const char *daemon_name : expected_daemons) {
    EXPECT_TRUE(collector.clients.find(daemon_name) != collector.clients.end())
      << "no admin socket client registered for " << daemon_name;
  }

  // Remove the mock directory and files
  std::filesystem::remove_all(mock_dir);
}
1515+
1516+
1517+
// Checks the "ceph_daemon_socket_up" gauge emitted by
// DaemonMetricCollector::dump_asok_metrics(): the value must be 0 while
// nothing answers on the daemon's admin socket path and 1 once a real
// AdminSocket is serving that path.
TEST(Exporter, HealthMetrics) {
  DaemonMetricCollector &collector = collector_instance();
  const std::string daemon = "test_daemon";
  const std::string metricName = "ceph_daemon_socket_up";
  // An empty counter dump makes dump_asok_metrics() query the socket itself.
  // NOTE(review): presumably the empty schema string behaves the same way
  // -- confirm against dump_asok_metrics().
  std::string expectedCounterDump = "";
  std::string expectedCounterSchema = "";

  // Fake admin socket
  std::string asok_path = "/tmp/" + daemon + ".asok";
  std::unique_ptr<AdminSocket> asokc = std::make_unique<AdminSocket>(g_ceph_context);
  AdminSocketTest asoct(asokc.get());

  // Make the fake daemon the collector's only client.
  collector.clients.clear();
  collector.clients.insert({daemon, AdminSocketClient(asok_path)});

  // Collects metrics once and checks the health gauge of `daemon`.
  // When shouldInitializeSocket is true the admin socket is started before
  // the collection and shut down right after it.
  auto verifyMetricValue = [&](const std::string &metricValue, bool shouldInitializeSocket) {
    collector.metrics = "";

    if (shouldInitializeSocket) {
      ASSERT_TRUE(asoct.init(asok_path));
    }

    collector.dump_asok_metrics(true, 5, true, expectedCounterDump, expectedCounterSchema, false);

    if (shouldInitializeSocket) {
      ASSERT_TRUE(asoct.shutdown());
    }

    // Match e.g.: ceph_daemon_socket_up{...ceph_daemon="test_daemon"...} 1
    const std::string retrievedMetrics = collector.metrics;
    const std::string pattern = metricName + R"(\{[^}]*ceph_daemon=\")" + daemon + R"(\"[^}]*\}\s+)" + metricValue + R"(\b)";
    const std::regex regexPattern(pattern);
    ASSERT_TRUE(std::regex_search(retrievedMetrics, regexPattern))
      << "expected " << metricName << " == " << metricValue
      << " for daemon " << daemon << "; collected metrics:\n"
      << retrievedMetrics;
  };

  // A fatal assertion inside the lambda only leaves the lambda, so each
  // call is wrapped to stop the whole test at the first fatal failure.

  // Test an admin socket not answering: metric value should be "0"
  ASSERT_NO_FATAL_FAILURE(verifyMetricValue("0", false));

  // Test an admin socket answering: metric value should be "1"
  ASSERT_NO_FATAL_FAILURE(verifyMetricValue("1", true));
}

0 commit comments

Comments
 (0)