Skip to content

Commit ce91e23

Browse files
authored
Merge pull request ceph#56446 from guojidan/metrics
crimson/mgr/client.cc: daemon_health_metrics support Reviewed-by: Matan Breizman <[email protected]>
2 parents f5ef9a5 + 25ff18b commit ce91e23

File tree

8 files changed

+110
-3
lines changed

8 files changed

+110
-3
lines changed

src/crimson/mgr/client.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,13 @@ void Client::report()
174174
});
175175
}
176176

177+
void Client::update_daemon_health(std::vector<DaemonHealthMetric>&& metrics)
178+
{
179+
daemon_health_metrics = std::move(metrics);
180+
}
181+
177182
void Client::_send_report()
178183
{
179-
// TODO: implement daemon_health_metrics support
180-
// https://tracker.ceph.com/issues/63766
181184
gates.dispatch_in_background(__func__, *this, [this] {
182185
if (!conn) {
183186
logger().warn("cannot send report; no conn available");
@@ -196,6 +199,7 @@ void Client::_send_report()
196199
report->daemon_name = local_conf()->name.get_id();
197200
}
198201
report->service_name = service_name;
202+
report->daemon_health_metrics = std::move(daemon_health_metrics);
199203
local_conf().get_config_bl(last_config_bl_version, &report->config_bl,
200204
&last_config_bl_version);
201205
return conn->send(std::move(report));

src/crimson/mgr/client.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "crimson/common/gated.h"
99
#include "crimson/net/Dispatcher.h"
1010
#include "crimson/net/Fwd.h"
11+
#include "mgr/DaemonHealthMetric.h"
1112
#include "mon/MgrMap.h"
1213

1314
template<typename Message> using Ref = boost::intrusive_ptr<Message>;
@@ -35,6 +36,7 @@ class Client : public crimson::net::Dispatcher {
3536
seastar::future<> start();
3637
seastar::future<> stop();
3738
void report();
39+
void update_daemon_health(std::vector<DaemonHealthMetric>&& metrics);
3840

3941
private:
4042
std::optional<seastar::future<>> ms_dispatch(
@@ -59,6 +61,8 @@ class Client : public crimson::net::Dispatcher {
5961
uint64_t last_config_bl_version = 0;
6062
std::string service_name, daemon_name;
6163

64+
std::vector<DaemonHealthMetric> daemon_health_metrics;
65+
6266
void _send_report();
6367
};
6468

src/crimson/osd/osd.cc

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ OSD::OSD(int id, uint32_t nonce,
105105
std::ignore = update_heartbeat_peers(
106106
).then([this] {
107107
update_stats();
108+
mgrc->update_daemon_health(get_health_metrics());
108109
tick_timer.arm(
109110
std::chrono::seconds(TICK_INTERVAL));
110111
});
@@ -976,7 +977,7 @@ void OSD::handle_conf_change(
976977
const crimson::common::ConfigProxy& conf,
977978
const std::set <std::string> &changed)
978979
{
979-
if (changed.count("osd_beacon_report_interval")) {
980+
if (changed.contains("osd_beacon_report_interval")) {
980981
beacon_timer.rearm_periodic(
981982
std::chrono::seconds(conf->osd_beacon_report_interval));
982983
}
@@ -1415,6 +1416,74 @@ seastar::future<> OSD::handle_recovery_subreq(
14151416
conn, std::move(m)).second;
14161417
}
14171418

1419+
vector<DaemonHealthMetric> OSD::get_health_metrics()
1420+
{
1421+
LOG_PREFIX(OSD::get_health_metrics);
1422+
vector<DaemonHealthMetric> metrics;
1423+
1424+
const utime_t now = ceph_clock_now();
1425+
utime_t oldest_secs = now;
1426+
utime_t too_old = now;
1427+
too_old -= local_conf()->osd_op_complaint_time;
1428+
int slow = 0;
1429+
ClientRequest::ICRef oldest_op;
1430+
map<uint64_t, int> slow_op_pools;
1431+
bool log_aggregated_slow_op = local_conf()->osd_aggregated_slow_ops_logging;
1432+
auto count_slow_ops = [&](const ClientRequest& op) {
1433+
if (op.get_started() < too_old) {
1434+
std::stringstream ss;
1435+
ss << "slow request ";
1436+
op.print(ss);
1437+
ss << " initiated "
1438+
<< op.get_started();
1439+
WARN("{}", ss.str());
1440+
if (log_aggregated_slow_op) {
1441+
uint64_t pool_id = op.get_pgid().pgid.m_pool;
1442+
if (pool_id > 0 && pool_id <= (uint64_t) osdmap->get_pool_max()) {
1443+
slow_op_pools[pool_id]++;
1444+
}
1445+
} else {
1446+
clog->warn() << ss.str();
1447+
}
1448+
++slow;
1449+
if (!oldest_op || op.get_started() < oldest_op->get_started()) {
1450+
oldest_op = &op;
1451+
}
1452+
}
1453+
};
1454+
1455+
auto& op_registry = get_shard_services().get_registry();
1456+
op_registry.visit_ops_in_flight(count_slow_ops);
1457+
if (slow) {
1458+
std::stringstream ss;
1459+
ss << __func__ << " reporting " << slow << " slow ops, oldest is ";
1460+
ceph_assert(oldest_op);
1461+
oldest_op->print(ss);
1462+
ERROR("{}", ss.str());
1463+
if (log_aggregated_slow_op && !slow_op_pools.empty()) {
1464+
std::stringstream ss;
1465+
auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
1466+
[](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
1467+
return p1.second < p2.second;
1468+
});
1469+
if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
1470+
string pool_name = osdmap->get_pool_name(slow_pool_it->first);
1471+
ss << "slow requests (most affected pool [ '"
1472+
<< pool_name
1473+
<< "' : "
1474+
<< slow_pool_it->second
1475+
<< " ])";
1476+
}
1477+
WARN("{}", ss.str());
1478+
clog->warn() << ss.str();
1479+
}
1480+
oldest_secs = now - oldest_op->get_started();
1481+
metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
1482+
}
1483+
1484+
return metrics;
1485+
}
1486+
14181487
bool OSD::should_restart() const
14191488
{
14201489
LOG_PREFIX(OSD::should_restart);

src/crimson/osd/osd.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,8 @@ class OSD final : public crimson::net::Dispatcher,
234234
crimson::net::ConnectionRef conn,
235235
Ref<MOSDPGUpdateLogMissingReply> m);
236236

237+
std::vector<DaemonHealthMetric> get_health_metrics();
238+
237239
private:
238240
crimson::common::gate_per_shard gate;
239241

src/crimson/osd/osd_operation.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,17 @@ size_t OSDOperationRegistry::dump_slowest_historic_client_requests(ceph::Formatt
136136
return ops_count;
137137
}
138138

139+
void OSDOperationRegistry::visit_ops_in_flight(std::function<void(const ClientRequest&)>&& visit)
140+
{
141+
const auto& client_registry =
142+
get_registry<static_cast<size_t>(OperationTypeCode::client_request)>();
143+
auto it = std::begin(client_registry);
144+
for (; it != std::end(client_registry); ++it) {
145+
const auto& fastest_historic_op = static_cast<const ClientRequest&>(*it);
146+
visit(fastest_historic_op);
147+
}
148+
}
149+
139150
OperationThrottler::OperationThrottler(ConfigProxy &conf)
140151
: scheduler(crimson::osd::scheduler::make_scheduler(conf))
141152
{

src/crimson/osd/osd_operation.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ struct OSDOperationRegistry : OperationRegistryT<
280280

281281
size_t dump_historic_client_requests(ceph::Formatter* f) const;
282282
size_t dump_slowest_historic_client_requests(ceph::Formatter* f) const;
283+
void visit_ops_in_flight(std::function<void(const ClientRequest&)>&& visit);
283284

284285
private:
285286
size_t num_recent_ops = 0;

src/crimson/osd/osd_operations/client_request.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,9 @@ ClientRequest::do_process(
450450
co_return;
451451
}
452452
}
453+
454+
co_await maybe_inject_delay();
455+
453456
if (m->get_oid().name.size()
454457
> crimson::common::local_conf()->osd_max_object_name_len) {
455458
co_await reply_op_error(pg, -ENAMETOOLONG);

src/crimson/osd/osd_operations/client_request.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#pragma once
55

66
#include <seastar/core/future.hh>
7+
#include <seastar/core/sleep.hh>
78

89
#include <boost/intrusive/list.hpp>
910
#include <boost/intrusive_ptr.hpp>
@@ -308,6 +309,18 @@ class ClientRequest final : public PhasedOperationT<ClientRequest>,
308309
};
309310

310311
void put_historic() const;
312+
/// Debug hook: with probability osd_debug_inject_dispatch_delay_probability,
/// return a future that resolves after
/// osd_debug_inject_dispatch_delay_duration seconds; otherwise return an
/// already-resolved future.
///
/// The probability is evaluated at a granularity of 1/10000. The duration
/// is honoured at millisecond resolution so that fractional (sub-second)
/// settings are not truncated to a zero-length sleep.
static interruptible_future<> maybe_inject_delay() {
  const auto probability =
    common::local_conf()->osd_debug_inject_dispatch_delay_probability;
  // Short-circuit keeps rand() from being called when injection is disabled.
  if (probability > 0 && rand() % 10000 < probability * 10000) {
    const auto delay_duration = std::chrono::duration<double>(
      common::local_conf()->osd_debug_inject_dispatch_delay_duration);
    // Casting to whole seconds would round e.g. 0.5s down to 0s.
    const auto a_while =
      std::chrono::duration_cast<std::chrono::milliseconds>(delay_duration);
    return seastar::sleep(a_while);
  }
  return seastar::now();
}
311324
};
312325

313326
}

0 commit comments

Comments
 (0)