@@ -105,6 +105,7 @@ OSD::OSD(int id, uint32_t nonce,
105105 std::ignore = update_heartbeat_peers (
106106 ).then ([this ] {
107107 update_stats ();
108+ mgrc->update_daemon_health (get_health_metrics ());
108109 tick_timer.arm (
109110 std::chrono::seconds (TICK_INTERVAL));
110111 });
@@ -976,7 +977,7 @@ void OSD::handle_conf_change(
976977 const crimson::common::ConfigProxy& conf,
977978 const std::set <std::string> &changed)
978979{
979- if (changed.count (" osd_beacon_report_interval" )) {
980+ if (changed.contains (" osd_beacon_report_interval" )) {
980981 beacon_timer.rearm_periodic (
981982 std::chrono::seconds (conf->osd_beacon_report_interval ));
982983 }
@@ -1415,6 +1416,74 @@ seastar::future<> OSD::handle_recovery_subreq(
14151416 conn, std::move (m)).second ;
14161417}
14171418
1419+ vector<DaemonHealthMetric> OSD::get_health_metrics ()
1420+ {
1421+ LOG_PREFIX (OSD::get_health_metrics);
1422+ vector<DaemonHealthMetric> metrics;
1423+
1424+ const utime_t now = ceph_clock_now ();
1425+ utime_t oldest_secs = now;
1426+ utime_t too_old = now;
1427+ too_old -= local_conf ()->osd_op_complaint_time ;
1428+ int slow = 0 ;
1429+ ClientRequest::ICRef oldest_op;
1430+ map<uint64_t , int > slow_op_pools;
1431+ bool log_aggregated_slow_op = local_conf ()->osd_aggregated_slow_ops_logging ;
1432+ auto count_slow_ops = [&](const ClientRequest& op) {
1433+ if (op.get_started () < too_old) {
1434+ std::stringstream ss;
1435+ ss << " slow request " ;
1436+ op.print (ss);
1437+ ss << " initiated "
1438+ << op.get_started ();
1439+ WARN (" {}" , ss.str ());
1440+ if (log_aggregated_slow_op) {
1441+ uint64_t pool_id = op.get_pgid ().pgid .m_pool ;
1442+ if (pool_id > 0 && pool_id <= (uint64_t ) osdmap->get_pool_max ()) {
1443+ slow_op_pools[pool_id]++;
1444+ }
1445+ } else {
1446+ clog->warn () << ss.str ();
1447+ }
1448+ ++slow;
1449+ if (!oldest_op || op.get_started () < oldest_op->get_started ()) {
1450+ oldest_op = &op;
1451+ }
1452+ }
1453+ };
1454+
1455+ auto & op_registry = get_shard_services ().get_registry ();
1456+ op_registry.visit_ops_in_flight (count_slow_ops);
1457+ if (slow) {
1458+ std::stringstream ss;
1459+ ss << __func__ << " reporting " << slow << " slow ops, oldest is " ;
1460+ ceph_assert (oldest_op);
1461+ oldest_op->print (ss);
1462+ ERROR (" {}" , ss.str ());
1463+ if (log_aggregated_slow_op && !slow_op_pools.empty ()) {
1464+ std::stringstream ss;
1465+ auto slow_pool_it = std::max_element (slow_op_pools.begin (), slow_op_pools.end (),
1466+ [](std::pair<uint64_t , int > p1, std::pair<uint64_t , int > p2) {
1467+ return p1.second < p2.second ;
1468+ });
1469+ if (osdmap->get_pools ().find (slow_pool_it->first ) != osdmap->get_pools ().end ()) {
1470+ string pool_name = osdmap->get_pool_name (slow_pool_it->first );
1471+ ss << " slow requests (most affected pool [ '"
1472+ << pool_name
1473+ << " ' : "
1474+ << slow_pool_it->second
1475+ << " ])" ;
1476+ }
1477+ WARN (" {}" , ss.str ());
1478+ clog->warn () << ss.str ();
1479+ }
1480+ oldest_secs = now - oldest_op->get_started ();
1481+ metrics.emplace_back (daemon_metric::SLOW_OPS, slow, oldest_secs);
1482+ }
1483+
1484+ return metrics;
1485+ }
1486+
14181487bool OSD::should_restart () const
14191488{
14201489 LOG_PREFIX (OSD::should_restart);
0 commit comments