Skip to content

Commit 34f209f

Browse files
committed
crimson/mgr/client.cc: daemon_health_metrics support
Fixes: https://tracker.ceph.com/issues/63766 Signed-off-by: junxiang Mu <[email protected]>
1 parent 27fd389 commit 34f209f

File tree

6 files changed

+94
-3
lines changed

6 files changed

+94
-3
lines changed

src/crimson/mgr/client.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,13 @@ void Client::report()
174174
});
175175
}
176176

177+
void Client::update_daemon_health(std::vector<DaemonHealthMetric>&& metrics)
178+
{
179+
daemon_health_metrics = std::move(metrics);
180+
}
181+
177182
void Client::_send_report()
178183
{
179-
// TODO: implement daemon_health_metrics support
180-
// https://tracker.ceph.com/issues/63766
181184
gates.dispatch_in_background(__func__, *this, [this] {
182185
if (!conn) {
183186
logger().warn("cannot send report; no conn available");
@@ -196,6 +199,7 @@ void Client::_send_report()
196199
report->daemon_name = local_conf()->name.get_id();
197200
}
198201
report->service_name = service_name;
202+
report->daemon_health_metrics = std::move(daemon_health_metrics);
199203
local_conf().get_config_bl(last_config_bl_version, &report->config_bl,
200204
&last_config_bl_version);
201205
return conn->send(std::move(report));

src/crimson/mgr/client.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "crimson/common/gated.h"
99
#include "crimson/net/Dispatcher.h"
1010
#include "crimson/net/Fwd.h"
11+
#include "mgr/DaemonHealthMetric.h"
1112
#include "mon/MgrMap.h"
1213

1314
template<typename Message> using Ref = boost::intrusive_ptr<Message>;
@@ -35,6 +36,7 @@ class Client : public crimson::net::Dispatcher {
3536
seastar::future<> start();
3637
seastar::future<> stop();
3738
void report();
39+
void update_daemon_health(std::vector<DaemonHealthMetric>&& metrics);
3840

3941
private:
4042
std::optional<seastar::future<>> ms_dispatch(
@@ -59,6 +61,8 @@ class Client : public crimson::net::Dispatcher {
5961
uint64_t last_config_bl_version = 0;
6062
std::string service_name, daemon_name;
6163

64+
std::vector<DaemonHealthMetric> daemon_health_metrics;
65+
6266
void _send_report();
6367
};
6468

src/crimson/osd/osd.cc

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ OSD::OSD(int id, uint32_t nonce,
105105
std::ignore = update_heartbeat_peers(
106106
).then([this] {
107107
update_stats();
108+
mgrc->update_daemon_health(get_health_metrics());
108109
tick_timer.arm(
109110
std::chrono::seconds(TICK_INTERVAL));
110111
});
@@ -976,7 +977,7 @@ void OSD::handle_conf_change(
976977
const crimson::common::ConfigProxy& conf,
977978
const std::set <std::string> &changed)
978979
{
979-
if (changed.count("osd_beacon_report_interval")) {
980+
if (changed.contains("osd_beacon_report_interval")) {
980981
beacon_timer.rearm_periodic(
981982
std::chrono::seconds(conf->osd_beacon_report_interval));
982983
}
@@ -1415,6 +1416,74 @@ seastar::future<> OSD::handle_recovery_subreq(
14151416
conn, std::move(m)).second;
14161417
}
14171418

1419+
vector<DaemonHealthMetric> OSD::get_health_metrics()
1420+
{
1421+
LOG_PREFIX(OSD::get_health_metrics);
1422+
vector<DaemonHealthMetric> metrics;
1423+
1424+
const utime_t now = ceph_clock_now();
1425+
utime_t oldest_secs = now;
1426+
utime_t too_old = now;
1427+
too_old -= local_conf()->osd_op_complaint_time;
1428+
int slow = 0;
1429+
ClientRequest::ICRef oldest_op;
1430+
map<uint64_t, int> slow_op_pools;
1431+
bool log_aggregated_slow_op = local_conf()->osd_aggregated_slow_ops_logging;
1432+
auto count_slow_ops = [&](const ClientRequest& op) {
1433+
if (op.get_started() < too_old) {
1434+
std::stringstream ss;
1435+
ss << "slow request ";
1436+
op.print(ss);
1437+
ss << " initiated "
1438+
<< op.get_started();
1439+
WARN("{}", ss.str());
1440+
if (log_aggregated_slow_op) {
1441+
uint64_t pool_id = op.get_pgid().pgid.m_pool;
1442+
if (pool_id > 0 && pool_id <= (uint64_t) osdmap->get_pool_max()) {
1443+
slow_op_pools[pool_id]++;
1444+
}
1445+
} else {
1446+
clog->warn() << ss.str();
1447+
}
1448+
++slow;
1449+
if (!oldest_op || op.get_started() < oldest_op->get_started()) {
1450+
oldest_op = &op;
1451+
}
1452+
}
1453+
};
1454+
1455+
auto& op_registry = get_shard_services().get_registry();
1456+
op_registry.visit_ops_in_flight(count_slow_ops);
1457+
if (slow) {
1458+
std::stringstream ss;
1459+
ss << __func__ << " reporting " << slow << " slow ops, oldest is ";
1460+
ceph_assert(oldest_op);
1461+
oldest_op->print(ss);
1462+
ERROR("{}", ss.str());
1463+
if (log_aggregated_slow_op && !slow_op_pools.empty()) {
1464+
std::stringstream ss;
1465+
auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
1466+
[](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
1467+
return p1.second < p2.second;
1468+
});
1469+
if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
1470+
string pool_name = osdmap->get_pool_name(slow_pool_it->first);
1471+
ss << "slow requests (most affected pool [ '"
1472+
<< pool_name
1473+
<< "' : "
1474+
<< slow_pool_it->second
1475+
<< " ])";
1476+
}
1477+
WARN("{}", ss.str());
1478+
clog->warn() << ss.str();
1479+
}
1480+
oldest_secs = now - oldest_op->get_started();
1481+
metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
1482+
}
1483+
1484+
return metrics;
1485+
}
1486+
14181487
bool OSD::should_restart() const
14191488
{
14201489
LOG_PREFIX(OSD::should_restart);

src/crimson/osd/osd.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,8 @@ class OSD final : public crimson::net::Dispatcher,
234234
crimson::net::ConnectionRef conn,
235235
Ref<MOSDPGUpdateLogMissingReply> m);
236236

237+
std::vector<DaemonHealthMetric> get_health_metrics();
238+
237239
private:
238240
crimson::common::gate_per_shard gate;
239241

src/crimson/osd/osd_operation.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,17 @@ size_t OSDOperationRegistry::dump_slowest_historic_client_requests(ceph::Formatt
136136
return ops_count;
137137
}
138138

139+
void OSDOperationRegistry::visit_ops_in_flight(std::function<void(const ClientRequest&)>&& visit)
140+
{
141+
const auto& client_registry =
142+
get_registry<static_cast<size_t>(OperationTypeCode::client_request)>();
143+
auto it = std::begin(client_registry);
144+
for (; it != std::end(client_registry); ++it) {
145+
const auto& fastest_historic_op = static_cast<const ClientRequest&>(*it);
146+
visit(fastest_historic_op);
147+
}
148+
}
149+
139150
OperationThrottler::OperationThrottler(ConfigProxy &conf)
140151
: scheduler(crimson::osd::scheduler::make_scheduler(conf))
141152
{

src/crimson/osd/osd_operation.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ struct OSDOperationRegistry : OperationRegistryT<
265265

266266
size_t dump_historic_client_requests(ceph::Formatter* f) const;
267267
size_t dump_slowest_historic_client_requests(ceph::Formatter* f) const;
268+
void visit_ops_in_flight(std::function<void(const ClientRequest&)>&& visit);
268269

269270
private:
270271
size_t num_recent_ops = 0;

0 commit comments

Comments
 (0)